diff --git a/DESCRIPTION b/DESCRIPTION index 225a095..2947c1d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ Suggests: testthat (>= 3.0.0) Remotes: UrbanInstitute/urbnthemes -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 URL: https://ui-research.github.io/urbnindicators/ VignetteBuilder: knitr Config/testthat/edition: 3 diff --git a/R/compile_acs_data.R b/R/compile_acs_data.R index ea45e70..1f7f5e7 100644 --- a/R/compile_acs_data.R +++ b/R/compile_acs_data.R @@ -331,7 +331,7 @@ compile_acs_data = function( retain_moes = TRUE, spatial = FALSE) { -warning("\n +message("\n Variable names and geographies for ACS data products can change between years. Changes to geographies are particularly significant across decades (e.g., from 2019 to 2020), but these changes can occur in any year.\n @@ -348,7 +348,7 @@ geographies over time should be thoroughly quality checked.\n") ## warning about inter-decadal tract geometry changes if ( (max(years) >= 2020) & (min(years) < 2020) & (geography == "tract") ) { - warning("Requested years span the year 2020, which is when the Census Bureau re-configures + message("Requested years span the year 2020, which is when the Census Bureau re-configures census tract boundaries. It is not valid to compare census tract-level statistics for years before 2020 to statistics from 2020 and after; use a crosswalk, such as those provided by NHGIS, to interpolate values. A future version of urbnindicators may address this issue automatically.") } @@ -456,7 +456,7 @@ geographies over time should be thoroughly quality checked.\n") { if (retain_moes == TRUE) dplyr::left_join(., moes, by = c("GEOID", "data_source_year")) else . } ## attach the codebook as an attribute named "codebook" to the returned dataset - attr(df_calculated_estimates, "codebook") = generate_codebook(.data = df_calculated_estimates) + attr(df_calculated_estimates, "codebook") = generate_codebook(.data = df_calculated_estimates %>% sf::st_drop_geometry()) return(df_calculated_estimates) } diff --git a/inst/test-data/codebook_2024-08-24.rds b/inst/test-data/codebook_2024-08-24.rds new file mode 100644 index 0000000..d0ea426 Binary files /dev/null and b/inst/test-data/codebook_2024-08-24.rds differ diff --git a/inst/test-data/test_data_2024-08-24.rds b/inst/test-data/test_data_2024-08-24.rds new file mode 100644 index 0000000..06b9bb3 Binary files /dev/null and b/inst/test-data/test_data_2024-08-24.rds differ diff --git a/tests/testthat/test-compile_acs_data.R b/tests/testthat/test-compile_acs_data.R index d1faeb4..89cc7d2 100644 --- a/tests/testthat/test-compile_acs_data.R +++ b/tests/testthat/test-compile_acs_data.R @@ -1,85 +1,96 @@ -####----Load Test Data----#### -## Statistics for CA and TX Tracts -df = compile_acs_data( - variables = NULL, - years = c(2022), - geography = "tract", - states = c("CA", "TX"), - counties = NULL, - retain_moes = TRUE, - spatial = FALSE) +message("Update test data prior to testing, as needed.") +# df = compile_acs_data( +# variables = NULL, +# years = c(2022), +# geography = "tract", +# states = c("CA", "TX"), +# counties = NULL, +# retain_moes = TRUE, +# spatial = FALSE) +# +# codebook = attr(df, "codebook") +# saveRDS(object = df, file = file.path("inst", "test-data", "test_data_2024-08-24.rds")) +# saveRDS(codebook, file = file.path("inst", "test-data", "codebook_2024-08-24.rds")) ####----Tests----#### # All percentages have no values greater than one and no values less than zero -percentage_outliers_maxima = df %>% - dplyr::select(dplyr::matches("percent$")) %>% - dplyr::summarise(dplyr::across(.cols = dplyr::where(is.numeric), ~ max(.x, na.rm = T))) %>% - tidyr::pivot_longer(cols = everything()) %>% - dplyr::arrange(desc(value)) %>% - dplyr::filter(value > 1) %>% - nrow - -percentage_outliers_minima = df %>% - dplyr::select(dplyr::matches("percent$")) %>% - dplyr::summarise(dplyr::across(.cols = where(is.numeric), ~ min(.x, na.rm = T))) %>% - tidyr::pivot_longer(cols = everything()) %>% - dplyr::arrange(desc(value)) %>% - dplyr::filter(value < 0) %>% - nrow - testthat::test_that( "All percentages have no values greater than one and no values less than zero", - { testthat::expect_equal(percentage_outliers_maxima, 0) - testthat::expect_equal(percentage_outliers_minima, 0) } ) + { + ## Statistics for CA and TX Tracts + df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) -# All measures have meaningful values: maximum and mean greater than zero, -# all values are not missing, and there are at least two distinct values per measure + percentage_outliers_maxima = df %>% + dplyr::select(dplyr::matches("percent$")) %>% + dplyr::summarise(dplyr::across(.cols = dplyr::where(is.numeric), ~ max(.x, na.rm = T))) %>% + tidyr::pivot_longer(cols = everything()) %>% + dplyr::arrange(desc(value)) %>% + dplyr::filter(value > 1) %>% + nrow -summary_statistics = df %>% - dplyr::select(GEOID, matches("percent$")) %>% - tidyr::pivot_longer(-GEOID) %>% - dplyr::group_by(name) %>% - dplyr::summarise( - count_total = dplyr::n(), - count_na = sum(is.na(value)), - min = min(value, na.rm = T), - max = max(value, na.rm = T), - mean = mean(value, na.rm = T), - distinct_values = dplyr::n_distinct(value, na.rm = T)) + percentage_outliers_minima = df %>% + dplyr::select(dplyr::matches("percent$")) %>% + dplyr::summarise(dplyr::across(.cols = where(is.numeric), ~ min(.x, na.rm = T))) %>% + tidyr::pivot_longer(cols = everything()) %>% + dplyr::arrange(desc(value)) %>% + dplyr::filter(value < 0) %>% + nrow -summary_statistics %>% - dplyr::filter(stringr::str_detect(name, "area")) + testthat::expect_equal(percentage_outliers_maxima, 0) + testthat::expect_equal(percentage_outliers_minima, 0) } ) +# All measures have meaningful values: maximum and mean greater than zero, +# all values are not missing, and there are at least two distinct values per measure testthat::test_that( "All measures have meaningful values", - { purrr::map_dbl(summary_statistics$max, ~testthat::expect_gt(.x, 0) ) + { + ## Statistics for CA and TX Tracts + df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) + + summary_statistics = df %>% + dplyr::select(GEOID, matches("percent$")) %>% + tidyr::pivot_longer(-GEOID) %>% + dplyr::group_by(name) %>% + dplyr::summarise( + count_total = dplyr::n(), + count_na = sum(is.na(value)), + min = min(value, na.rm = T), + max = max(value, na.rm = T), + mean = mean(value, na.rm = T), + distinct_values = dplyr::n_distinct(value, na.rm = T)) + + purrr::map_dbl(summary_statistics$max, ~testthat::expect_gt(.x, 0) ) purrr::map_dbl(summary_statistics$mean, ~testthat::expect_gt(.x, 0) ) purrr::map2_dbl(summary_statistics$count_na, summary_statistics$count_total, ~ testthat::expect_lt(.x, .y) ) purrr::map_dbl(summary_statistics$min, ~testthat::expect_gte(.x, 0) ) }) -# No percentage measure has exactly the same values as that of another percentage measure -# (Note: this is implemented with tract data across the entire US. This does not +# No percentage measure has exactly the same values (for every observation) as +# that of another percentage measure (Note: this is implemented with tract data. This does not # inherently apply to other levels of geographic observation, nor for subsets of the US.) -duplicates = purrr::map_dfr( - colnames(df %>% dplyr::select(dplyr::matches("percent$"))), - function(colname1) { - purrr::map_dfr( - colnames(df %>% dplyr::select(-dplyr::matches("_M$|universe"))), - function(colname2) { - if ((colname1 != colname2) & (identical(df[[colname1]], df[[colname2]]))) { - data.frame(variable_one = colname1, variable_two = colname2)}})}) - -distinct_duplicates = duplicates %>% - dplyr::mutate( - combined_variables = dplyr::if_else( - variable_one > variable_two, - paste(variable_one, variable_two), - paste(variable_two, variable_one))) %>% - dplyr::distinct(combined_variables, .keep_all = T) %>% - # this variable intentionally has a single duplicate (differently-named) variable - # for consistency across different concepts - dplyr::filter(!stringr::str_detect(variable_one, "year_structure_built.*later")) - testthat::test_that( "All percentages are distinct", - { testthat::expect_equal(nrow(distinct_duplicates), 0) } ) + { + ## Statistics for CA and TX Tracts + df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) + + duplicates = purrr::map_dfr( + colnames(df %>% dplyr::select(dplyr::matches("percent$"))), + function(colname1) { + purrr::map_dfr( + colnames(df %>% dplyr::select(-dplyr::matches("_M$|universe"))), + function(colname2) { + if ((colname1 != colname2) & (identical(df[[colname1]], df[[colname2]]))) { + data.frame(variable_one = colname1, variable_two = colname2)}})}) + + distinct_duplicates = duplicates %>% + dplyr::mutate( + combined_variables = dplyr::if_else( + variable_one > variable_two, + paste(variable_one, variable_two), + paste(variable_two, variable_one))) %>% + dplyr::distinct(combined_variables, .keep_all = T) %>% + # this variable intentionally has a single duplicate (differently-named) variable + # for consistency across different concepts + dplyr::filter(!stringr::str_detect(variable_one, "year_structure_built.*later")) + + testthat::expect_equal(nrow(distinct_duplicates), 0) } ) diff --git a/tests/testthat/test-generate_codebook.R b/tests/testthat/test-generate_codebook.R index a9c187d..5376198 100644 --- a/tests/testthat/test-generate_codebook.R +++ b/tests/testthat/test-generate_codebook.R @@ -1,77 +1,89 @@ -####----Load Test Data----#### - -## Statistics for NJ Counties -df = compile_acs_data( - variables = list_acs_variables(year = "2022"), - years = 2022, - geography = "county", - states = "NJ", - counties = NULL, - retain_moes = TRUE, - spatial = TRUE) - -codebook = attr(df, "codebook") - -#####----TESTING----##### +## Testing data is created in test-compile_acs_data.R +####----Tests----#### ## No missingness in codebook - results_missingness = codebook %>% - dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ is.na(.x))) %>% - nrow - testthat::test_that( "No column in the codebook has a missing value.", - { testthat::expect_equal(results_missingness, 0) } ) + { + codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) -## No transcribed function calls - results_transcribed_functions = codebook %>% - dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "dplyr"))) %>% - nrow + results_missingness = codebook %>% + dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ is.na(.x))) %>% + nrow + + testthat::expect_equal(results_missingness, 0) } ) +## No transcribed function calls testthat::test_that( "No transcribed functions included in codebook output.", - { testthat::expect_equal(results_transcribed_functions, 0) } ) + { + codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) -## No missing raw variable codes - results_missing_raw_variables = codebook %>% - dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "\\(\\)|\\(NA\\)"))) %>% - nrow + results_transcribed_functions = codebook %>% + dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "dplyr"))) %>% + nrow + + testthat::expect_equal(results_transcribed_functions, 0) } ) +## No missing raw variable codes testthat::test_that( "No variable definitions contain '(NA)' in lieu of the raw variable code.", - { testthat::expect_equal(results_missing_raw_variables, 0) } ) + { + codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) -## No universe variables in numerators (except population density) - results_universe_numerators = codebook %>% - dplyr::filter(stringr::str_detect(definition, "Numerator.*universe.*Denominator")) %>% - nrow + results_missing_raw_variables = codebook %>% + dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "\\(\\)|\\(NA\\)"))) %>% + nrow + + testthat::expect_equal(results_missing_raw_variables, 0) } ) +## No universe variables in numerators (except population density) testthat::test_that( "Only population density contains a universe variable in the numerator.", - { testthat::expect_equal(results_universe_numerators, 1) } ) + { + codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) -## No definitions for variables that are percentages of universes (not possible) - results_universe_percentages = codebook %>% - dplyr::filter(stringr::str_detect(calculated_variable, "universe.*percent$")) %>% - nrow + results_universe_numerators = codebook %>% + dplyr::filter(stringr::str_detect(definition, "Numerator.*universe.*Denominator")) %>% + nrow + + testthat::expect_equal(results_universe_numerators, 1) } ) +## No definitions for variables that are percentages of universes (not possible) testthat::test_that( "No calculated variables are perentages of a universe estimate.", - { testthat::expect_equal(results_universe_percentages, 0) } ) + { + codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) -## No codebook variable definitions that are missing from the input dataset - results_phantom_definitions = codebook %>% - dplyr::filter(!(calculated_variable %in% (df %>% colnames))) %>% - nrow + results_universe_percentages = codebook %>% + dplyr::filter(stringr::str_detect(calculated_variable, "universe.*percent$")) %>% + nrow + + testthat::expect_equal(results_universe_percentages, 0) } ) +## No codebook variable definitions that are missing from the input dataset testthat::test_that( "No codebook entries for variables that don't exist in the input data.", - { testthat::expect_equal(results_phantom_definitions, 0) } ) + { + ## Statistics for CA and TX Tracts + df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) + codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) -## All variables in the input data are in the codebook - derived_variables = df %>% dplyr::select(dplyr::matches("percent$")) %>% colnames - undefined_variables = derived_variables[!(derived_variables %in% (codebook %>% dplyr::pull(calculated_variable)))] + results_phantom_definitions = codebook %>% + dplyr::filter(!(calculated_variable %in% (df %>% colnames))) %>% + nrow + + testthat::expect_equal(results_phantom_definitions, 0) } ) +## All variables in the input data are in the codebook testthat::test_that( "All variables in the input data are in the codebook.", - { testthat::expect_equal(length(undefined_variables), 0) } ) + { + ## Statistics for CA and TX Tracts + df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) + codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) + + derived_variables = df %>% dplyr::select(dplyr::matches("percent$")) %>% colnames + undefined_variables = derived_variables[!(derived_variables %in% (codebook %>% dplyr::pull(calculated_variable)))] + + testthat::expect_equal(length(undefined_variables), 0) } )