-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.Rmd
132 lines (100 loc) · 5.06 KB
/
preprocessing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
---
title: "Data Prerocessing"
output: html_document
date: "2023-09-25"
---
```{r}
library(xlsx)
library(tidyverse)
library(sf)
```
Scripts to clean up the original datasets supplied by Quido. This code should not need to be rerun ever
## clean up facilities dataset
```{r}
# load original data
facs <- read.xlsx("NL_geocoded_working.xlsx",1)
# cleanup
facs <- facs %>% select(-NA.) # remove random column
colnames(facs) <- c("country", "country_code", "facility_name", "coordinates", "start_date") # rename columns
facs <- facs %>%
mutate(coordinates = gsub(",", ".", coordinates)) %>% # fix inconsistency in coords
filter(!is.na(coordinates)) # remove rows with no coords (why do these exist?)
facs$start_date[facs$start_date == -99] <- NA # replace -99 (unknown start dates) with NA
facs$start_date <- as.Date(paste(facs$start_date, "-01-01 00:00:00", sep = "")) # convert start date to Date object
# add lat and lng cols
facs <- facs %>%
mutate(lat = as.character(lapply(strsplit(coordinates, "[.]"), function(x) paste(x[1], x[2], sep = "."))),
lng = as.character(lapply(strsplit(coordinates, "[.]"), function(x) paste(x[3], x[4], sep = "."))))
facs <- facs[, c("facility_name", "country", "country_code", "coordinates", "lat", "lng", "start_date")] # reorder columns
write.csv(facs, "data/facilities.csv")
```
## clean up missiles dataset
```{r}
# load original data
missiles <- read.xlsx("Missiles_datageocoding_prep.xlsx", 1)
# cleanup
colnames(missiles) <- c("address", "country", "coordinates", "address_found", "No.rd.", "W.h", "start_date","end_date","armament","RA_or_enclosure" ) # rename columns
missiles <- missiles %>%
mutate(coordinates = gsub(",", ".", coordinates)) # fix inconsistency in coords
missiles$start_date <- as.Date(paste(missiles$start_date, "-01-01 00:00:00", sep = "")) # convert start date to Date object
missiles$end_date <- as.Date(paste(missiles$end_date, "-01-01 00:00:00", sep = ""))
missiles$coordinates[27] <- paste(missiles$coordinates[27],".0",sep = "") # add '.0' at the end of the 27th coord cuz it was misformatted
# add lat and lng cols
missiles <- missiles %>%
mutate(lat = as.character(lapply(strsplit(coordinates, "[.]"), function(x) paste(x[1], x[2], sep = "."))),
lng = as.character(lapply(strsplit(coordinates, "[.]"), function(x) paste(x[3], x[4], sep = "."))))
missiles <- missiles[, c("address_found", "country", "coordinates", "lat", "lng", "start_date", "end_date", "address","No.rd.", "W.h","armament","RA_or_enclosure")] # reorder columns
write.csv(missiles, "data/missiles.csv")
```
## clean up and integrate sat datasets
```{r}
# need to clean up coords n shit
sat1 <- read_csv("data/sat1.csv")
sat2 <- read_csv("data/sat2.csv")
sat3 <- read_csv("data/sat3.csv")
# fix column inconsistencies
setdiff(colnames(sat2), colnames(sat1))
sat1$`Segment Count` <- rep(NA, nrow(sat1))
sat1$`Operations Number` <- rep(NA, nrow(sat1))
colnames(sat1)[colnames(sat1) == "NW Cormer Lat dec"] <- "NW Corner Lat dec"
sat1$`Data Source` <- rep("declass1", nrow(sat1))
setdiff(colnames(sat1), colnames(sat2))
sat2$`Direction Flag` <- rep(NA, nrow(sat2))
sat2$`Data Source` <- rep("declass2", nrow(sat2))
# reformat Display ID for sat1, sat2 due to bad formatting in CSV provided by USGS
reformat_id <- function(id, n) {
id <- strsplit(id, ",")[[1]][1] # split by comma, take first item
id
}
sat1$`Display ID` <- lapply(sat1$`Display ID`, reformat_id, n = 6)
sat2$`Display ID` <- lapply(sat2$`Display ID`, reformat_id, n = 7)
sat1_2 <- rbind(sat1,sat2)
setdiff(colnames(sat1_2), colnames(sat3))
sat3 <- sat3 %>%
mutate(`Camera Type` = str_replace(Camera, "A", "High Resolution Surveillance Camera - Aft")) %>%
mutate(`Camera Type` = str_replace(`Camera Type`, "F", "High Resolution Surveillance Camera - Forward")) %>%
mutate(`Camera Type` = str_replace(`Camera Type`, "T", "Lower Resolution Terrain Mapping Camera")) %>%
select(-Camera)
colnames(sat1_2)[colnames(sat1_2) == "Frame"] <- "Frame Number"
colnames(sat1_2)[colnames(sat1) == "Down Load Available"] <- "Download Available"
sat3$`Direction Flag` <- rep(NA, nrow(sat3))
sat3$`Segment Count` <- rep(NA, nrow(sat3))
sat3$`Data Source` <- rep("declass3", nrow(sat3))
sat <- rbind(sat1_2, sat3)
```
```{r}
# make wkt representation of sat geometry
sat <- sat %>%
mutate(geometry = paste("POLYGON ((",
`NW Corner Long dec`, " ", `NW Corner Lat dec`, ",",
`NE Corner Long dec`, " ", `NE Corner Lat dec`, ",",
`SE Corner Long dec`, " ", `SE Corner Lat dec`, ",",
`SW Corner Long dec`, " ", `SW Corner Lat dec`, ",",
`NW Corner Long dec`, " ", `NW Corner Lat dec`, "))", sep = ""))
sat <- sat[, -c(13:32)]
# interpret display ID a character string rather than a number
sat$`Display ID` <- as.character(sat$`Display ID`)
# define a more general camera resolution parameter
sat <- sat %>% mutate(`Camera Resolution (General)` = ifelse(`Camera Resolution` %in% c("Stereo High", "Vertical High", "2 to 4 feet"), "High", "Low"))
write_csv(sat, "data/sat.csv")
```