Create export/import functions for GW/SW data

sets scraped from Wasserportal. to do: clean vignettes in order to be in line with new functions (and also use new import functions!)
KWB-R · Sep 8, 2022 · 1cce450 · 1cce450
1 parent 1d26eb5
commit 1cce450
Show file tree

Hide file tree

Showing 13 changed files with 355 additions and 17 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -31,6 +31,7 @@ Imports:
     archive,
     data.table,
     dplyr,
+    fs,
     httr,
     kwb.datetime,
     kwb.utils,
@@ -41,6 +42,7 @@ Imports:
     stringr,
     tibble,
     tidyr,
+    withr,
     xml2
 Suggests: 
     covr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(base_url_download)
 export(columns_to_labels)
 export(get_daily_surfacewater_data)
 export(get_groundwater_data)
@@ -15,13 +16,18 @@ export(get_wasserportal_stations)
 export(get_wasserportal_stations_table)
 export(get_wasserportal_variables)
 export(list_data_to_csv_or_zip)
+export(list_masters_data_to_csv)
+export(list_timeseries_data_to_zip)
 export(read)
 export(readPackageFile)
 export(read_wasserportal)
 export(read_wasserportal_raw)
 export(read_wasserportal_raw_gw)
 export(wasserportal_base_url)
-import(kwb.utils)
+export(wp_masters_data_to_list)
+export(wp_timeseries_data_to_list)
+importFrom(archive,archive_extract)
+importFrom(archive,archive_write)
 importFrom(data.table,rbindlist)
 importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
@@ -32,18 +38,21 @@ importFrom(dplyr,pull)
 importFrom(dplyr,rename)
 importFrom(dplyr,select)
 importFrom(dplyr,select_if)
+importFrom(fs,dir_create)
 importFrom(httr,POST)
 importFrom(httr,content)
 importFrom(kwb.datetime,textToEuropeBerlinPosix)
 importFrom(kwb.utils,catAndRun)
 importFrom(kwb.utils,readPackageFile)
 importFrom(kwb.utils,selectColumns)
+importFrom(kwb.utils,selectElements)
 importFrom(kwb.utils,stopFormatted)
 importFrom(kwb.utils,substSpecialChars)
 importFrom(magrittr,"%>%")
 importFrom(parallel,detectCores)
 importFrom(parallel,makeCluster)
 importFrom(parallel,stopCluster)
+importFrom(readr,read_csv)
 importFrom(readr,write_csv)
 importFrom(rlang,.data)
 importFrom(rvest,html_attr)
@@ -56,9 +65,12 @@ importFrom(stringr,str_extract)
 importFrom(stringr,str_remove)
 importFrom(stringr,str_remove_all)
 importFrom(stringr,str_replace)
+importFrom(stringr,str_replace_all)
 importFrom(stringr,str_split_fixed)
 importFrom(tibble,tibble)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
+importFrom(utils,download.file)
 importFrom(utils,read.table)
+importFrom(withr,with_dir)
 importFrom(xml2,read_html)
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,13 @@
-# [wasserportal 0.2.0](https://github.com/KWB-R/wasserportal/releases/tag/v0.2.0) <small>2022-09-07</small>
+# [wasserportal 0.2.0](https://github.com/KWB-R/wasserportal/releases/tag/v0.2.0) <small>2022-09-08</small>
 
+* Add functions for exporting time series data to `zip` files (`list_timeseries_data_to_zip()`) 
+and master data to `csv` files (`list_masters_data_to_csv()`), which will be 
+uploaded to [https://kwb-r.github.io/wasserportal](https://kwb-r.github.io/wasserportal)/`<filename>`
+
+* In addition `import` functions for downloading and importing the datasets above 
+into R as lists were added (`wp_timeseries_data_to_list()`, `wp_masters_data_to_list()`)
+
+* Code cleaning by `@hsonne` started
 
 * Fix `master data` requests by using the `master_url` instead of `station_id`, 
 as the latter was not unique. Now functions `get_wasserportal_master_data()` and 

diff --git a/R/list_data_to_csv_or_zip.R b/R/list_data_to_csv_or_zip.R
@@ -5,8 +5,9 @@
 #' @param to_zip convert to zip file (default: FALSE)
 #' @return loops through list of data frames and uses list names as filenames
 #' @export
+#' @importFrom archive archive_write
 #' @importFrom readr write_csv
-#' @importFrom stringr str_replace
+#' @importFrom stringr str_replace str_replace_all
 list_data_to_csv_or_zip <- function(
     data_list,
     file_prefix = "",
@@ -18,11 +19,15 @@ list_data_to_csv_or_zip <- function(
     filename_base <- paste0(
       file_prefix,
       name %>%
-        stringr::str_replace("_", "-") %>%
+        stringr::str_replace_all("_", "-") %>%
         stringr::str_replace("\\.", "_"),
       collapse = "_"
     )
 
+    if(startsWith(filename_base, "surface")) {
+      filename_base <- paste0("daily_", filename_base)
+    }
+
     filename_csv <- paste0(filename_base, ".csv")
     filename_zip <- paste0(filename_base, ".zip")
 
@@ -47,3 +52,50 @@ list_data_to_csv_or_zip <- function(
 
   unlist(tmp)
 }
+
+
+#' Helper function: list timeseries data to zip
+#'
+#' Thin wrapper around \code{\link{list_data_to_csv_or_zip}} that uses an empty
+#' file prefix and sets \code{to_zip = TRUE}.
+#'
+#' @param timeseries_data_list time series data in list form as retrieved by
+#' \code{\link{get_groundwater_data}} or
+#' \code{\link{get_daily_surfacewater_data}}
+#' @return value returned by \code{\link{list_data_to_csv_or_zip}}, which loops
+#' through the list of data frames and uses the list names as filenames
+#' @export
+#' @examples
+#' \dontrun{
+#' stations <- wasserportal::get_stations()
+#' # Groundwater Time Series
+#' gw_tsdata_list <- wasserportal::get_groundwater_data(stations)
+#' gw_tsdata_files <- wasserportal::list_timeseries_data_to_zip(gw_tsdata_list)
+#' # Surface Water Time Series
+#' sw_tsdata_list <- wasserportal::get_daily_surfacewater_data(stations)
+#' sw_tsdata_files <- wasserportal::list_timeseries_data_to_zip(sw_tsdata_list)
+#' }
+list_timeseries_data_to_zip <- function(timeseries_data_list) {
+  list_data_to_csv_or_zip(timeseries_data_list, file_prefix = "", to_zip = TRUE)
+}
+
+#' Helper function: list masters data to csv
+#'
+#' Thin wrapper around \code{\link{list_data_to_csv_or_zip}} that prefixes the
+#' file names with "stations_" and sets \code{to_zip = FALSE}.
+#' @param masters_data_list masters data in list form as retrieved by
+#' \code{\link{get_stations}} sublist element "overview_list"
+#' @return value returned by \code{\link{list_data_to_csv_or_zip}}, which loops
+#' through the list of data frames and uses the list names as filenames
+#' @export
+#' @examples
+#' \dontrun{
+#' stations <- wasserportal::get_stations()
+#' csv_files <- wasserportal::list_masters_data_to_csv(stations$overview_list)
+#' csv_files
+#' }
+list_masters_data_to_csv <- function(masters_data_list) {
+  list_data_to_csv_or_zip(
+    masters_data_list, file_prefix = "stations_", to_zip = FALSE
+  )
+}
diff --git a/R/read_wasserportal_raw.R b/R/read_wasserportal_raw.R
@@ -29,8 +29,9 @@ get_station_variables <- function(station_df)
 #' i.e. `get_stations()$crosstable`
 #' @return ????
 #' @export
-#' @import kwb.utils
+#' @importFrom kwb.utils catAndRun selectColumns selectElements
 #' @importFrom kwb.datetime textToEuropeBerlinPosix
+#' @importFrom httr content POST
 read_wasserportal_raw <- function(
   variable,
   station,
@@ -136,6 +137,7 @@ read_wasserportal_raw <- function(
 }
 
 # clean_timestamp_columns ------------------------------------------------------
+
 clean_timestamp_columns <- function(data, include_raw_time)
 {
   raw_timestamps <- kwb.utils::selectColumns(data, "Datum")

diff --git a/R/wp_masters_data_to_list.R b/R/wp_masters_data_to_list.R
@@ -0,0 +1,66 @@
+#' Wasserportal Master Data: download and Import in R List
+#'
+#' Derives one file name per element of \code{overview_list_names}, fetches the
+#' matching csv (or zip) file from \code{base_url_download()} and imports it
+#' with \code{readr::read_csv()}.
+#'
+#' @param overview_list_names names of "overview_list" as retrieved by
+#' \code{\link{get_stations}}
+#' @param target_dir target directory for downloading data (default:
+#' tempdir())
+#' @param file_prefix prefix prepended to each file name (default:
+#' "stations_")
+#' @param is_zipped are the data to be downloaded zipped (default:
+#' FALSE)
+#'
+#' @return list with one imported data set per element of
+#' \code{overview_list_names}, named by the file names without extension
+#' @export
+#' @importFrom archive archive_extract
+#' @importFrom fs dir_create
+#' @importFrom utils download.file
+#' @importFrom readr read_csv
+#' @importFrom stringr str_replace str_replace_all
+#' @importFrom withr with_dir
+#' @examples
+#' \dontrun{
+#' stations <- wasserportal::get_stations()
+#' overview_list_names <- names(stations$overview_list)
+#' wp_masters_data_list <- wp_masters_data_to_list(overview_list_names)
+#' }
+wp_masters_data_to_list <- function(overview_list_names,
+                            target_dir = tempdir(),
+                            file_prefix = "stations_",
+                            is_zipped = FALSE) {
+
+  fs::dir_create(target_dir, recurse = TRUE)
+
+  filenames_base <- overview_list_names %>%
+    stringr::str_replace_all("_", "-") %>%
+    stringr::str_replace("\\.", "_")
+  # sprintf() (unlike paste0()) keeps a zero-length input zero-length
+  filenames_base <- sprintf("%s%s", file_prefix, filenames_base)
+
+  stats::setNames(lapply(filenames_base, function(filename_base) {
+    if (is_zipped) {
+      url <- sprintf("%s/%s.zip", base_url_download(), filename_base)
+      # Extract relative to target_dir, then read the extracted file(s)
+      withr::with_dir(new = target_dir, code = {
+        archive::archive_extract(url) %>%
+          readr::read_csv()
+      })
+    } else {
+      filename_csv <- paste0(filename_base, ".csv")
+      url <- sprintf("%s/%s", base_url_download(), filename_csv)
+      target_path <- file.path(target_dir, filename_csv)
+      # Stop on a failed download instead of reading a missing or stale file
+      status <- tryCatch(
+        utils::download.file(url = url, destfile = target_path),
+        error = function(e) stop("Could not download ", url, ": ",
+                                 conditionMessage(e), call. = FALSE)
+      )
+      if (status != 0L) stop("Could not download ", url, call. = FALSE)
+      readr::read_csv(file = target_path)
+    }
+  }), nm = filenames_base)
+}
diff --git a/R/wp_time-series_data_to_list.R b/R/wp_time-series_data_to_list.R
@@ -0,0 +1,76 @@
+#' Helper function: base url for download
+#'
+#' @return base url for download of csv/zip files prepared by R package
+#' @keywords internal
+#' @export
+#' @examples
+#' base_url_download()
+base_url_download <- function() {
+  "https://kwb-r.github.io/wasserportal"
+}
+
+#' Wasserportal Time Series Data: download and Import in R List
+#'
+#' Derives one file name per element of \code{overview_list_names}, fetches the
+#' matching zip (or csv) file from \code{base_url_download()} and imports it
+#' with \code{readr::read_csv()}.
+#'
+#' @param overview_list_names names of "overview_list" as retrieved by
+#' \code{\link{get_stations}}
+#' @param target_dir target directory for downloading data (default:
+#' tempdir())
+#' @param is_zipped are the data to be downloaded zipped (default:
+#' TRUE)
+#'
+#' @return list with one imported data set per element of
+#' \code{overview_list_names}, named by the file names without extension
+#' @export
+#' @importFrom archive archive_extract
+#' @importFrom fs dir_create
+#' @importFrom utils download.file
+#' @importFrom readr read_csv
+#' @importFrom stringr str_replace str_replace_all
+#' @importFrom withr with_dir
+#' @examples
+#' \dontrun{
+#' stations <- wasserportal::get_stations()
+#' overview_list_names <- names(stations$overview_list)
+#' wp_timeseries_data_list <- wp_timeseries_data_to_list(overview_list_names)
+#' }
+wp_timeseries_data_to_list <- function(overview_list_names,
+                            target_dir = tempdir(),
+                            is_zipped = TRUE) {
+
+  fs::dir_create(target_dir, recurse = TRUE)
+
+  filenames_base <- overview_list_names %>%
+    stringr::str_replace_all("_", "-") %>%
+    stringr::str_replace("\\.", "_")
+  # File names starting with "surface" get the prefix "daily_"
+  filenames_base <- stringr::str_replace_all(
+    filenames_base, "^surface", "daily_surface"
+  )
+
+  stats::setNames(lapply(filenames_base, function(filename_base) {
+    if (is_zipped) {
+      url <- sprintf("%s/%s.zip", base_url_download(), filename_base)
+      # Extract relative to target_dir, then read the extracted file(s)
+      withr::with_dir(new = target_dir, code = {
+        archive::archive_extract(url) %>%
+          readr::read_csv()
+      })
+    } else {
+      filename_csv <- paste0(filename_base, ".csv")
+      url <- sprintf("%s/%s", base_url_download(), filename_csv)
+      target_path <- file.path(target_dir, filename_csv)
+      # Stop on a failed download instead of reading a missing or stale file
+      status <- tryCatch(
+        utils::download.file(url = url, destfile = target_path),
+        error = function(e) stop("Could not download ", url, ": ",
+                                 conditionMessage(e), call. = FALSE)
+      )
+      if (status != 0L) stop("Could not download ", url, call. = FALSE)
+      readr::read_csv(file = target_path)
+    }
+  }), nm = filenames_base)
+}
diff --git a/man/list_masters_data_to_csv.Rd b/man/list_masters_data_to_csv.Rd
diff --git a/man/list_timeseries_data_to_zip.Rd b/man/list_timeseries_data_to_zip.Rd