Merge pull request #42 from UI-Research/new-indicators

minor modifications to tests
UI-Research · Aug 24, 2024 · 50765bf · 50765bf
2 parents eebfc41 + 69fdce5
commit 50765bf
Show file tree

Hide file tree

Showing 6 changed files with 143 additions and 120 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -33,7 +33,7 @@ Suggests:
     testthat (>= 3.0.0)
 Remotes: 
     UrbanInstitute/urbnthemes
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 URL: https://ui-research.github.io/urbnindicators/
 VignetteBuilder: knitr
 Config/testthat/edition: 3
diff --git a/R/compile_acs_data.R b/R/compile_acs_data.R
@@ -331,7 +331,7 @@ compile_acs_data = function(
     retain_moes = TRUE,
     spatial = FALSE) {
 
-warning("\n
+message("\n
 Variable names and geographies for ACS data products can change between years.
 Changes to geographies are particularly significant across decades
 (e.g., from 2019 to 2020), but these changes can occur in any year.\n
@@ -348,7 +348,7 @@ geographies over time should be thoroughly quality checked.\n")
 
   ## warning about inter-decadal tract geometry changes
   if ( (max(years) >= 2020) & (min(years) < 2020) & (geography == "tract") ) {
-    warning("Requested years span the year 2020, which is when the Census Bureau re-configures
+    message("Requested years span the year 2020, which is when the Census Bureau re-configures
       census tract boundaries. It is not valid to compare census tract-level statistics for years before 2020 to
       statistics from 2020 and after; use a crosswalk, such as those provided by NHGIS, to interpolate values.
       A future version of urbnindicators may address this issue automatically.") }
@@ -456,7 +456,7 @@ geographies over time should be thoroughly quality checked.\n")
     { if (retain_moes == TRUE) dplyr::left_join(., moes, by = c("GEOID", "data_source_year")) else . }
 
   ## attach the codebook as an attribute named "codebook" to the returned dataset
-  attr(df_calculated_estimates, "codebook") = generate_codebook(.data = df_calculated_estimates)
+  attr(df_calculated_estimates, "codebook") = generate_codebook(.data = df_calculated_estimates %>% sf::st_drop_geometry())
 
   return(df_calculated_estimates)
 }

diff --git a/inst/test-data/codebook_2024-08-24.rds b/inst/test-data/codebook_2024-08-24.rds
diff --git a/inst/test-data/test_data_2024-08-24.rds b/inst/test-data/test_data_2024-08-24.rds
diff --git a/tests/testthat/test-compile_acs_data.R b/tests/testthat/test-compile_acs_data.R
@@ -1,85 +1,96 @@
-####----Load Test Data----####
-## Statistics for CA and TX Tracts
-df = compile_acs_data(
-  variables = NULL,
-  years = c(2022),
-  geography = "tract",
-  states = c("CA", "TX"),
-  counties = NULL,
-  retain_moes = TRUE,
-  spatial = FALSE)
+message("Update test data prior to testing, as needed.")
+# df = compile_acs_data(
+#   variables = NULL,
+#   years = c(2022),
+#   geography = "tract",
+#   states = c("CA", "TX"),
+#   counties = NULL,
+#   retain_moes = TRUE,
+#   spatial = FALSE)
+#
+# codebook = attr(df, "codebook")
+# saveRDS(object = df, file = file.path("inst", "test-data", "test_data_2024-08-24.rds"))
+# saveRDS(codebook, file = file.path("inst", "test-data", "codebook_2024-08-24.rds"))
 
 ####----Tests----####
 # All percentages have no values greater than one and no values less than zero
-percentage_outliers_maxima = df %>%
-  dplyr::select(dplyr::matches("percent$")) %>%
-  dplyr::summarise(dplyr::across(.cols = dplyr::where(is.numeric), ~ max(.x, na.rm = T))) %>%
-  tidyr::pivot_longer(cols = everything()) %>%
-  dplyr::arrange(desc(value)) %>%
-  dplyr::filter(value > 1) %>%
-  nrow
-
-percentage_outliers_minima = df %>%
-  dplyr::select(dplyr::matches("percent$")) %>%
-  dplyr::summarise(dplyr::across(.cols = where(is.numeric), ~ min(.x, na.rm = T))) %>%
-  tidyr::pivot_longer(cols = everything()) %>%
-  dplyr::arrange(desc(value)) %>%
-  dplyr::filter(value < 0) %>%
-  nrow
-
 testthat::test_that(
   "All percentages have no values greater than one and no values less than zero",
-  { testthat::expect_equal(percentage_outliers_maxima, 0)
-    testthat::expect_equal(percentage_outliers_minima, 0) } )
+  {
+    ## Statistics for CA and TX Tracts
+    df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))
 
-# All measures have meaningful values: maximum and mean greater than zero,
-# all values are not missing, and there are at least two distinct values per measure
+    percentage_outliers_maxima = df %>%
+      dplyr::select(dplyr::matches("percent$")) %>%
+      dplyr::summarise(dplyr::across(.cols = dplyr::where(is.numeric), ~ max(.x, na.rm = T))) %>%
+      tidyr::pivot_longer(cols = everything()) %>%
+      dplyr::arrange(desc(value)) %>%
+      dplyr::filter(value > 1) %>%
+      nrow
 
-summary_statistics = df %>%
-  dplyr::select(GEOID, matches("percent$")) %>%
-  tidyr::pivot_longer(-GEOID) %>%
-  dplyr::group_by(name) %>%
-  dplyr::summarise(
-    count_total  = dplyr::n(),
-    count_na = sum(is.na(value)),
-    min = min(value, na.rm = T),
-    max = max(value, na.rm = T),
-    mean = mean(value, na.rm = T),
-    distinct_values = dplyr::n_distinct(value, na.rm = T))
+    percentage_outliers_minima = df %>%
+      dplyr::select(dplyr::matches("percent$")) %>%
+      dplyr::summarise(dplyr::across(.cols = where(is.numeric), ~ min(.x, na.rm = T))) %>%
+      tidyr::pivot_longer(cols = everything()) %>%
+      dplyr::arrange(desc(value)) %>%
+      dplyr::filter(value < 0) %>%
+      nrow
 
-summary_statistics %>%
-  dplyr::filter(stringr::str_detect(name, "area"))
+    testthat::expect_equal(percentage_outliers_maxima, 0)
+    testthat::expect_equal(percentage_outliers_minima, 0) } )
 
+# All measures have meaningful values: maximum and mean greater than zero,
+# all values are not missing, and there are at least two distinct values per measure
 testthat::test_that(
   "All measures have meaningful values",
-  { purrr::map_dbl(summary_statistics$max, ~testthat::expect_gt(.x, 0) )
+  {
+    ## Statistics for CA and TX Tracts
+    df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))
+
+    summary_statistics = df %>%
+      dplyr::select(GEOID, matches("percent$")) %>%
+      tidyr::pivot_longer(-GEOID) %>%
+      dplyr::group_by(name) %>%
+      dplyr::summarise(
+        count_total  = dplyr::n(),
+        count_na = sum(is.na(value)),
+        min = min(value, na.rm = T),
+        max = max(value, na.rm = T),
+        mean = mean(value, na.rm = T),
+        distinct_values = dplyr::n_distinct(value, na.rm = T))
+
+    purrr::map_dbl(summary_statistics$max, ~testthat::expect_gt(.x, 0) )
     purrr::map_dbl(summary_statistics$mean, ~testthat::expect_gt(.x, 0) )
     purrr::map2_dbl(summary_statistics$count_na, summary_statistics$count_total, ~ testthat::expect_lt(.x, .y) )
     purrr::map_dbl(summary_statistics$min, ~testthat::expect_gte(.x, 0) ) })
 
-# No percentage measure has exactly the same values as that of another percentage measure
-# (Note: this is implemented with tract data across the entire US. This does not
+# No percentage measure has exactly the same values (for every observation) as
+# that of another percentage measure (Note: this is implemented with tract data. This does not
 # inherently apply to other levels of geographic observation, nor for subsets of the US.)
-duplicates = purrr::map_dfr(
-  colnames(df %>% dplyr::select(dplyr::matches("percent$"))),
-  function(colname1) {
-    purrr::map_dfr(
-      colnames(df %>% dplyr::select(-dplyr::matches("_M$|universe"))),
-      function(colname2) {
-        if ((colname1 != colname2) & (identical(df[[colname1]], df[[colname2]]))) {
-          data.frame(variable_one = colname1, variable_two = colname2)}})})
-
-distinct_duplicates = duplicates %>%
-  dplyr::mutate(
-    combined_variables = dplyr::if_else(
-      variable_one > variable_two,
-      paste(variable_one, variable_two),
-      paste(variable_two, variable_one))) %>%
-  dplyr::distinct(combined_variables, .keep_all = T) %>%
-  # this variable intentionally has a single duplicate (differently-named) variable
-  # for consistency across different concepts
-  dplyr::filter(!stringr::str_detect(variable_one, "year_structure_built.*later"))
-
 testthat::test_that(
   "All percentages are distinct",
-  { testthat::expect_equal(nrow(distinct_duplicates), 0) } )
+  {
+    ## Statistics for CA and TX Tracts
+    df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))
+
+    duplicates = purrr::map_dfr(
+      colnames(df %>% dplyr::select(dplyr::matches("percent$"))),
+      function(colname1) {
+        purrr::map_dfr(
+          colnames(df %>% dplyr::select(-dplyr::matches("_M$|universe"))),
+          function(colname2) {
+            if ((colname1 != colname2) & (identical(df[[colname1]], df[[colname2]]))) {
+              data.frame(variable_one = colname1, variable_two = colname2)}})})
+
+    distinct_duplicates = duplicates %>%
+      dplyr::mutate(
+        combined_variables = dplyr::if_else(
+          variable_one > variable_two,
+          paste(variable_one, variable_two),
+          paste(variable_two, variable_one))) %>%
+      dplyr::distinct(combined_variables, .keep_all = T) %>%
+      # this variable intentionally has a single duplicate (differently-named) variable
+      # for consistency across different concepts
+      dplyr::filter(!stringr::str_detect(variable_one, "year_structure_built.*later"))
+
+    testthat::expect_equal(nrow(distinct_duplicates), 0) } )
diff --git a/tests/testthat/test-generate_codebook.R b/tests/testthat/test-generate_codebook.R
@@ -1,77 +1,89 @@
-####----Load Test Data----####
-
-## Statistics for NJ Counties
-df = compile_acs_data(
-  variables = list_acs_variables(year = "2022"),
-  years = 2022,
-  geography = "county",
-  states = "NJ",
-  counties = NULL,
-  retain_moes = TRUE,
-  spatial = TRUE)
-
-codebook = attr(df, "codebook")
-
-#####----TESTING----#####
+## Testing data is created in test-compile_acs_data.R
 
+####----Tests----####
 ## No missingness in codebook
-  results_missingness = codebook %>%
-    dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ is.na(.x))) %>%
-    nrow
-
   testthat::test_that(
     "No column in the codebook has a missing value.",
-    { testthat::expect_equal(results_missingness, 0) } )
+    {
+      codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))
 
-## No transcribed function calls
-  results_transcribed_functions = codebook %>%
-    dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "dplyr"))) %>%
-    nrow
+      results_missingness = codebook %>%
+        dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ is.na(.x))) %>%
+        nrow
+
+      testthat::expect_equal(results_missingness, 0) } )
 
+## No transcribed function calls
   testthat::test_that(
     "No transcribed functions included in codebook output.",
-    { testthat::expect_equal(results_transcribed_functions, 0) } )
+    {
+      codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))
 
-## No missing raw variable codes
-  results_missing_raw_variables = codebook %>%
-    dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "\\(\\)|\\(NA\\)"))) %>%
-    nrow
+      results_transcribed_functions = codebook %>%
+        dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "dplyr"))) %>%
+        nrow
+
+      testthat::expect_equal(results_transcribed_functions, 0) } )
 
+## No missing raw variable codes
   testthat::test_that(
     "No variable definitions contain '(NA)' in lieu of the raw variable code.",
-    { testthat::expect_equal(results_missing_raw_variables, 0) } )
+    {
+      codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))
 
-## No universe variables in numerators (except population density)
-  results_universe_numerators = codebook %>%
-    dplyr::filter(stringr::str_detect(definition, "Numerator.*universe.*Denominator")) %>%
-    nrow
+      results_missing_raw_variables = codebook %>%
+        dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "\\(\\)|\\(NA\\)"))) %>%
+        nrow
+
+      testthat::expect_equal(results_missing_raw_variables, 0) } )
 
+## No universe variables in numerators (except population density)
   testthat::test_that(
     "Only population density contains a universe variable in the numerator.",
-    { testthat::expect_equal(results_universe_numerators, 1) } )
+    {
+      codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))
 
-## No definitions for variables that are percentages of universes (not possible)
-  results_universe_percentages = codebook %>%
-    dplyr::filter(stringr::str_detect(calculated_variable, "universe.*percent$")) %>%
-    nrow
+      results_universe_numerators = codebook %>%
+        dplyr::filter(stringr::str_detect(definition, "Numerator.*universe.*Denominator")) %>%
+        nrow
+
+      testthat::expect_equal(results_universe_numerators, 1) } )
 
+## No definitions for variables that are percentages of universes (not possible)
   testthat::test_that(
     "No calculated variables are perentages of a universe estimate.",
-    { testthat::expect_equal(results_universe_percentages, 0) } )
+    {
+      codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))
 
-## No codebook variable definitions that are missing from the input dataset
-  results_phantom_definitions = codebook %>%
-    dplyr::filter(!(calculated_variable %in% (df %>% colnames))) %>%
-    nrow
+      results_universe_percentages = codebook %>%
+        dplyr::filter(stringr::str_detect(calculated_variable, "universe.*percent$")) %>%
+        nrow
+
+      testthat::expect_equal(results_universe_percentages, 0) } )
 
+## No codebook variable definitions that are missing from the input dataset
   testthat::test_that(
     "No codebook entries for variables that don't exist in the input data.",
-    { testthat::expect_equal(results_phantom_definitions, 0) } )
+    {
+      ## Statistics for CA and TX Tracts
+      df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))
+      codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))
 
-## All variables in the input data are in the codebook
-  derived_variables = df %>% dplyr::select(dplyr::matches("percent$")) %>% colnames
-  undefined_variables = derived_variables[!(derived_variables %in% (codebook %>% dplyr::pull(calculated_variable)))]
+      results_phantom_definitions = codebook %>%
+        dplyr::filter(!(calculated_variable %in% (df %>% colnames))) %>%
+        nrow
+
+      testthat::expect_equal(results_phantom_definitions, 0) } )
 
+## All variables in the input data are in the codebook
   testthat::test_that(
     "All variables in the input data are in the codebook.",
-    { testthat::expect_equal(length(undefined_variables), 0) } )
+    {
+      ## Statistics for CA and TX Tracts
+      df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))
+      codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))
+
+      derived_variables = df %>% dplyr::select(dplyr::matches("percent$")) %>% colnames
+      undefined_variables = derived_variables[!(derived_variables %in% (codebook %>% dplyr::pull(calculated_variable)))]
+
+      testthat::expect_equal(length(undefined_variables), 0) } )