## ------------------------------------------ ##
# SPARC Soil P -- Join Ancillary Data
## ------------------------------------------ ##
# Script author(s): Nick J Lyon
# Purpose:
## Attaches the ancillary data GoogleSheets to the data created by the preceding script
## ------------------------------------------ ##
# Housekeeping ----
## ------------------------------------------ ##
# Load necessary libraries
# install.packages("librarian")
librarian::shelf(tidyverse, googledrive, readxl, magrittr)
# Create necessary sub-folder(s)
dir.create(path = file.path("data", "tidy_data"), showWarnings = F)
dir.create(path = file.path("data", "ancillary_data"), showWarnings = F)
# Clear environment
rm(list = ls())
# Identify needed tidy file(s)
tidy_drive <- googledrive::as_id("https://drive.google.com/drive/u/0/folders/1pjgN-wRlec65NDLBvryibifyx6k9Iqy9")
# Identify the archival data in that folder and download it
## Note: the braces keep magrittr from also passing the whole table as drive_download()'s first argument
googledrive::drive_ls(path = tidy_drive) %>%
  dplyr::filter(name == "sparc-soil-p_full-data-p-sums.csv") %>%
  { googledrive::drive_download(file = .$id, overwrite = T,
                                path = file.path("data", "tidy_data", .$name)) }
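# Optional sanity check: make sure the download actually landed before reading it in
if(!file.exists(file.path("data", "tidy_data", "sparc-soil-p_full-data-p-sums.csv"))){
  stop("Tidy data file was not downloaded successfully") }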
# Read that file in
full_v1 <- read.csv(file = file.path("data", "tidy_data",
                                     "sparc-soil-p_full-data-p-sums.csv"))
# Glimpse it!
dplyr::glimpse(full_v1)
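# The joins below key on spatial columns (e.g., 'lter') that this file is assumed to contain;
# fail early with a clear message if the preceding script didn't produce them
if(!"lter" %in% names(full_v1)){
  stop("Expected spatial key column 'lter' not found in the tidy data") }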
## ------------------------------------------ ##
# Acquire Ancillary Data ----
## ------------------------------------------ ##
# Identify ancillary data file names
anc_names <- paste0("Ancillary_", c("dataset", "site", "block", "plot", "core"))
# Identify the desired ancillary data files
anc_files <- googledrive::drive_ls(path = googledrive::as_id("https://drive.google.com/drive/u/0/folders/1TwN8AwUKc3iLBsTRRzm68owNlUOgkQeI")) %>%
  dplyr::filter(name %in% anc_names)
# Did that get all five?
anc_files
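# Programmatic version of the check above: stop if any of the five are missing
if(nrow(anc_files) != length(anc_names)){
  stop("Missing ancillary file(s): ", paste(setdiff(anc_names, anc_files$name), collapse = "; ")) }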
# Download those files into the ancillary data folder
purrr::walk2(.x = anc_files$id, .y = anc_files$name,
             .f = ~ googledrive::drive_download(file = .x, overwrite = T,
                                                path = file.path("data", "ancillary_data", .y)))
## ------------------------------------------ ##
# Attach Ancillary Data ----
## ------------------------------------------ ##
# Make a new version of our primary data so the original survives intact if a join goes wrong
full_v2 <- full_v1
# Identify the available ancillary data granularity levels
gran_levels <- c("dataset", "site", "block", "plot", "core")
# Loop across desired/available ancillary data to integrate with data
for(granularity in gran_levels){
  # Starting message
  message("Integrating ", granularity, "-level ancillary data")
  # Identify the file
  gran_path <- file.path("data", "ancillary_data", paste0("Ancillary_", granularity, ".xlsx"))
  # Read it in
  gran_df <- readxl::read_xlsx(path = gran_path)
  # Fix weird issue with numbers in the spatial level columns (likely '1' read in as '1.0')
  ## One `across()` call covers whichever of the four spatial levels this sheet contains
  gran_df <- gran_df %>%
    dplyr::mutate(dplyr::across(.cols = dplyr::any_of(c("site", "block", "plot", "core")),
                                .fns = ~ gsub(pattern = "\\.0", replacement = "", x = .x)))
  # Identify spatial organization columns (in this level of the ancillary data)
  spatial_cols <- intersect(x = c("lter", "dataset_simp", gran_levels), y = names(gran_df))
  # Make the non-granularity columns specific to the level from which they were entered
  ## Note: renaming only the non-spatial columns means the sheet's column order doesn't matter
  gran_df <- gran_df %>%
    dplyr::rename_with(.fn = ~ paste0(granularity, "_", .x),
                       .cols = -dplyr::all_of(spatial_cols))
  # Now join onto the larger SPARC data
  full_v2 %<>%
    dplyr::left_join(y = gran_df, by = spatial_cols) }
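# Sanity check: the left joins should only add columns, never rows
# (assumes each ancillary sheet has at most one row per spatial key combination)
if(nrow(full_v2) != nrow(full_v1)){
  warning("Row count changed during ancillary joins; check for duplicate keys in the ancillary sheets") }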
# Check what that leaves us with
dplyr::glimpse(full_v2)
# Drop the data object left over from the loop
rm(list = "gran_df")
## ------------------------------------------ ##
# Streamline Ancillary Data ----
## ------------------------------------------ ##
# Now need to collapse 'duplicate' columns from the ancillary data into a single 'actual' value
## Will do one pipe/object per set of related columns
# Fix latitude/longitude columns
full_v3a <- full_v2 %>%
  ## Collapse columns together into a single 'actual' one
  ## (finest spatial grain first so the most specific available coordinate wins)
  dplyr::mutate(lat_actual = dplyr::coalesce(lat, as.numeric(core_latitude),
                                             as.numeric(plot_latitude),
                                             as.numeric(block_latitude),
                                             as.numeric(site_latitude),
                                             as.numeric(dataset_latitude)),
                lon_actual = dplyr::coalesce(lon, as.numeric(core_longitude),
                                             as.numeric(plot_longitude),
                                             as.numeric(block_longitude),
                                             as.numeric(site_longitude),
                                             as.numeric(dataset_longitude)),
                .after = raw_filename) %>%
  # Throw away (sorry) all component columns now that we have 'actual'
  dplyr::select(-lat, -lon, -dplyr::ends_with("_latitude"),
                -dplyr::ends_with("_longitude"), -dplyr::ends_with("_coordinate_source")) %>%
  # Rename the 'actual' columns to their final names
  dplyr::rename(latitude = lat_actual,
                longitude = lon_actual)
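# Quick plausibility check on the coalesced coordinates
if(any(abs(full_v3a$latitude) > 90, na.rm = TRUE) |
   any(abs(full_v3a$longitude) > 180, na.rm = TRUE)){
  warning("Some coordinates fall outside valid latitude/longitude ranges") }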
# Glimpse it
dplyr::glimpse(full_v3a)
# Streamline precipitation
full_v3b <- full_v3a %>%
  # Collapse into one column (value from the primary data first, site-level MAP as the fallback)
  dplyr::mutate(precip_actual = dplyr::coalesce(precipitation_mm, site_MAP_mm),
                .after = core) %>%
  # Drop old columns
  dplyr::select(-precipitation_mm, -site_MAP_mm) %>%
  # Rename remaining column
  dplyr::rename(mean.annual.precip_mm = precip_actual)
# Look at it
dplyr::glimpse(full_v3b)
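# Flag any negative precipitation values introduced by either source
if(any(full_v3b$mean.annual.precip_mm < 0, na.rm = TRUE)){
  warning("Negative precipitation values detected; check both precipitation sources") }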
# Do temperature next
full_v3c <- full_v3b %>%
  # Collapse (only one temperature source so far; coalesce kept for symmetry with the steps above)
  dplyr::mutate(temp_actual = dplyr::coalesce(site_MAT_C),
                .after = mean.annual.precip_mm) %>%
  # Drop old
  dplyr::select(-site_MAT_C) %>%
  # Rename new
  dplyr::rename(mean.annual.temp_C = temp_actual)
# Look at it
dplyr::glimpse(full_v3c)
## ------------------------------------------ ##
# Export Ancillary + P Sum Data ----
## ------------------------------------------ ##
# Create a final data object
final_combo <- full_v3c
# Check its structure
dplyr::glimpse(final_combo)
# Define the tidy file name
combo_name <- "sparc-soil-p_full-data-incl-ancillary.csv"
# Save out the final data object
write.csv(x = final_combo, file = file.path("data", "tidy_data", combo_name),
          row.names = F, na = "")
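# Confirm the file was written before attempting the upload
if(!file.exists(file.path("data", "tidy_data", combo_name))){
  stop("Final CSV was not written; aborting upload") }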
# Export to that folder in the Drive
googledrive::drive_upload(media = file.path("data", "tidy_data", combo_name),
                          overwrite = T, path = tidy_drive)
# End ----