Skip to content

Commit

Permalink
Merge pull request #42 from UI-Research/new-indicators
Browse files Browse the repository at this point in the history
minor modifications to tests
  • Loading branch information
wcurrangroome authored Aug 24, 2024
2 parents eebfc41 + 69fdce5 commit 50765bf
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 120 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Suggests:
testthat (>= 3.0.0)
Remotes:
UrbanInstitute/urbnthemes
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
URL: https://ui-research.github.io/urbnindicators/
VignetteBuilder: knitr
Config/testthat/edition: 3
6 changes: 3 additions & 3 deletions R/compile_acs_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ compile_acs_data = function(
retain_moes = TRUE,
spatial = FALSE) {

warning("\n
message("\n
Variable names and geographies for ACS data products can change between years.
Changes to geographies are particularly significant across decades
(e.g., from 2019 to 2020), but these changes can occur in any year.\n
Expand All @@ -348,7 +348,7 @@ geographies over time should be thoroughly quality checked.\n")

## warning about inter-decadal tract geometry changes
if ( (max(years) >= 2020) & (min(years) < 2020) & (geography == "tract") ) {
warning("Requested years span the year 2020, which is when the Census Bureau re-configures
message("Requested years span the year 2020, which is when the Census Bureau re-configures
census tract boundaries. It is not valid to compare census tract-level statistics for years before 2020 to
statistics from 2020 and after; use a crosswalk, such as those provided by NHGIS, to interpolate values.
A future version of urbnindicators may address this issue automatically.") }
Expand Down Expand Up @@ -456,7 +456,7 @@ geographies over time should be thoroughly quality checked.\n")
{ if (retain_moes == TRUE) dplyr::left_join(., moes, by = c("GEOID", "data_source_year")) else . }

## attach the codebook as an attribute named "codebook" to the returned dataset
attr(df_calculated_estimates, "codebook") = generate_codebook(.data = df_calculated_estimates)
attr(df_calculated_estimates, "codebook") = generate_codebook(.data = df_calculated_estimates %>% sf::st_drop_geometry())

return(df_calculated_estimates)
}
Expand Down
Binary file added inst/test-data/codebook_2024-08-24.rds
Binary file not shown.
Binary file added inst/test-data/test_data_2024-08-24.rds
Binary file not shown.
145 changes: 78 additions & 67 deletions tests/testthat/test-compile_acs_data.R
Original file line number Diff line number Diff line change
@@ -1,85 +1,96 @@
####----Load Test Data----####
## Statistics for CA and TX Tracts
df = compile_acs_data(
variables = NULL,
years = c(2022),
geography = "tract",
states = c("CA", "TX"),
counties = NULL,
retain_moes = TRUE,
spatial = FALSE)
message("Update test data prior to testing, as needed.")
# df = compile_acs_data(
# variables = NULL,
# years = c(2022),
# geography = "tract",
# states = c("CA", "TX"),
# counties = NULL,
# retain_moes = TRUE,
# spatial = FALSE)
#
# codebook = attr(df, "codebook")
# saveRDS(object = df, file = file.path("inst", "test-data", "test_data_2024-08-24.rds"))
# saveRDS(codebook, file = file.path("inst", "test-data", "codebook_2024-08-24.rds"))

####----Tests----####
# All percentages have no values greater than one and no values less than zero
percentage_outliers_maxima = df %>%
dplyr::select(dplyr::matches("percent$")) %>%
dplyr::summarise(dplyr::across(.cols = dplyr::where(is.numeric), ~ max(.x, na.rm = T))) %>%
tidyr::pivot_longer(cols = everything()) %>%
dplyr::arrange(desc(value)) %>%
dplyr::filter(value > 1) %>%
nrow

percentage_outliers_minima = df %>%
dplyr::select(dplyr::matches("percent$")) %>%
dplyr::summarise(dplyr::across(.cols = where(is.numeric), ~ min(.x, na.rm = T))) %>%
tidyr::pivot_longer(cols = everything()) %>%
dplyr::arrange(desc(value)) %>%
dplyr::filter(value < 0) %>%
nrow

testthat::test_that(
"All percentages have no values greater than one and no values less than zero",
{ testthat::expect_equal(percentage_outliers_maxima, 0)
testthat::expect_equal(percentage_outliers_minima, 0) } )
{
## Statistics for CA and TX Tracts
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))

# All measures have meaningful values: maximum and mean greater than zero,
# all values are not missing, and there are at least two distinct values per measure
percentage_outliers_maxima = df %>%
dplyr::select(dplyr::matches("percent$")) %>%
dplyr::summarise(dplyr::across(.cols = dplyr::where(is.numeric), ~ max(.x, na.rm = T))) %>%
tidyr::pivot_longer(cols = everything()) %>%
dplyr::arrange(desc(value)) %>%
dplyr::filter(value > 1) %>%
nrow

summary_statistics = df %>%
dplyr::select(GEOID, matches("percent$")) %>%
tidyr::pivot_longer(-GEOID) %>%
dplyr::group_by(name) %>%
dplyr::summarise(
count_total = dplyr::n(),
count_na = sum(is.na(value)),
min = min(value, na.rm = T),
max = max(value, na.rm = T),
mean = mean(value, na.rm = T),
distinct_values = dplyr::n_distinct(value, na.rm = T))
percentage_outliers_minima = df %>%
dplyr::select(dplyr::matches("percent$")) %>%
dplyr::summarise(dplyr::across(.cols = where(is.numeric), ~ min(.x, na.rm = T))) %>%
tidyr::pivot_longer(cols = everything()) %>%
dplyr::arrange(desc(value)) %>%
dplyr::filter(value < 0) %>%
nrow

summary_statistics %>%
dplyr::filter(stringr::str_detect(name, "area"))
testthat::expect_equal(percentage_outliers_maxima, 0)
testthat::expect_equal(percentage_outliers_minima, 0) } )

# All measures have meaningful values: maximum and mean greater than zero,
# all values are not missing, and there are at least two distinct values per measure
testthat::test_that(
"All measures have meaningful values",
{ purrr::map_dbl(summary_statistics$max, ~testthat::expect_gt(.x, 0) )
{
## Statistics for CA and TX Tracts
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))

summary_statistics = df %>%
dplyr::select(GEOID, matches("percent$")) %>%
tidyr::pivot_longer(-GEOID) %>%
dplyr::group_by(name) %>%
dplyr::summarise(
count_total = dplyr::n(),
count_na = sum(is.na(value)),
min = min(value, na.rm = T),
max = max(value, na.rm = T),
mean = mean(value, na.rm = T),
distinct_values = dplyr::n_distinct(value, na.rm = T))

purrr::map_dbl(summary_statistics$max, ~testthat::expect_gt(.x, 0) )
purrr::map_dbl(summary_statistics$mean, ~testthat::expect_gt(.x, 0) )
purrr::map2_dbl(summary_statistics$count_na, summary_statistics$count_total, ~ testthat::expect_lt(.x, .y) )
purrr::map_dbl(summary_statistics$min, ~testthat::expect_gte(.x, 0) ) })

# No percentage measure has exactly the same values as that of another percentage measure
# (Note: this is implemented with tract data across the entire US. This does not
# No percentage measure has exactly the same values (for every observation) as
# that of another percentage measure (Note: this is implemented with tract data. This does not
# inherently apply to other levels of geographic observation, nor for subsets of the US.)
duplicates = purrr::map_dfr(
colnames(df %>% dplyr::select(dplyr::matches("percent$"))),
function(colname1) {
purrr::map_dfr(
colnames(df %>% dplyr::select(-dplyr::matches("_M$|universe"))),
function(colname2) {
if ((colname1 != colname2) & (identical(df[[colname1]], df[[colname2]]))) {
data.frame(variable_one = colname1, variable_two = colname2)}})})

distinct_duplicates = duplicates %>%
dplyr::mutate(
combined_variables = dplyr::if_else(
variable_one > variable_two,
paste(variable_one, variable_two),
paste(variable_two, variable_one))) %>%
dplyr::distinct(combined_variables, .keep_all = T) %>%
# this variable intentionally has a single duplicate (differently-named) variable
# for consistency across different concepts
dplyr::filter(!stringr::str_detect(variable_one, "year_structure_built.*later"))

testthat::test_that(
"All percentages are distinct",
{ testthat::expect_equal(nrow(distinct_duplicates), 0) } )
{
## Statistics for CA and TX Tracts
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))

duplicates = purrr::map_dfr(
colnames(df %>% dplyr::select(dplyr::matches("percent$"))),
function(colname1) {
purrr::map_dfr(
colnames(df %>% dplyr::select(-dplyr::matches("_M$|universe"))),
function(colname2) {
if ((colname1 != colname2) & (identical(df[[colname1]], df[[colname2]]))) {
data.frame(variable_one = colname1, variable_two = colname2)}})})

distinct_duplicates = duplicates %>%
dplyr::mutate(
combined_variables = dplyr::if_else(
variable_one > variable_two,
paste(variable_one, variable_two),
paste(variable_two, variable_one))) %>%
dplyr::distinct(combined_variables, .keep_all = T) %>%
# this variable intentionally has a single duplicate (differently-named) variable
# for consistency across different concepts
dplyr::filter(!stringr::str_detect(variable_one, "year_structure_built.*later"))

testthat::expect_equal(nrow(distinct_duplicates), 0) } )
110 changes: 61 additions & 49 deletions tests/testthat/test-generate_codebook.R
Original file line number Diff line number Diff line change
@@ -1,77 +1,89 @@
####----Load Test Data----####

## Statistics for NJ Counties
df = compile_acs_data(
variables = list_acs_variables(year = "2022"),
years = 2022,
geography = "county",
states = "NJ",
counties = NULL,
retain_moes = TRUE,
spatial = TRUE)

codebook = attr(df, "codebook")

#####----TESTING----#####
## Testing data is created in test-compile_acs_data.R

####----Tests----####
## No missingness in codebook
results_missingness = codebook %>%
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ is.na(.x))) %>%
nrow

testthat::test_that(
"No column in the codebook has a missing value.",
{ testthat::expect_equal(results_missingness, 0) } )
{
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))

## No transcribed function calls
results_transcribed_functions = codebook %>%
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "dplyr"))) %>%
nrow
results_missingness = codebook %>%
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ is.na(.x))) %>%
nrow

testthat::expect_equal(results_missingness, 0) } )

## No transcribed function calls
testthat::test_that(
"No transcribed functions included in codebook output.",
{ testthat::expect_equal(results_transcribed_functions, 0) } )
{
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))

## No missing raw variable codes
results_missing_raw_variables = codebook %>%
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "\\(\\)|\\(NA\\)"))) %>%
nrow
results_transcribed_functions = codebook %>%
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "dplyr"))) %>%
nrow

testthat::expect_equal(results_transcribed_functions, 0) } )

## No missing raw variable codes
testthat::test_that(
"No variable definitions contain '(NA)' in lieu of the raw variable code.",
{ testthat::expect_equal(results_missing_raw_variables, 0) } )
{
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))

## No universe variables in numerators (except population density)
results_universe_numerators = codebook %>%
dplyr::filter(stringr::str_detect(definition, "Numerator.*universe.*Denominator")) %>%
nrow
results_missing_raw_variables = codebook %>%
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "\\(\\)|\\(NA\\)"))) %>%
nrow

testthat::expect_equal(results_missing_raw_variables, 0) } )

## No universe variables in numerators (except population density)
testthat::test_that(
"Only population density contains a universe variable in the numerator.",
{ testthat::expect_equal(results_universe_numerators, 1) } )
{
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))

## No definitions for variables that are percentages of universes (not possible)
results_universe_percentages = codebook %>%
dplyr::filter(stringr::str_detect(calculated_variable, "universe.*percent$")) %>%
nrow
results_universe_numerators = codebook %>%
dplyr::filter(stringr::str_detect(definition, "Numerator.*universe.*Denominator")) %>%
nrow

testthat::expect_equal(results_universe_numerators, 1) } )

## No definitions for variables that are percentages of universes (not possible)
testthat::test_that(
"No calculated variables are perentages of a universe estimate.",
{ testthat::expect_equal(results_universe_percentages, 0) } )
{
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))

## No codebook variable definitions that are missing from the input dataset
results_phantom_definitions = codebook %>%
dplyr::filter(!(calculated_variable %in% (df %>% colnames))) %>%
nrow
results_universe_percentages = codebook %>%
dplyr::filter(stringr::str_detect(calculated_variable, "universe.*percent$")) %>%
nrow

testthat::expect_equal(results_universe_percentages, 0) } )

## No codebook variable definitions that are missing from the input dataset
testthat::test_that(
"No codebook entries for variables that don't exist in the input data.",
{ testthat::expect_equal(results_phantom_definitions, 0) } )
{
## Statistics for CA and TX Tracts
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))

## All variables in the input data are in the codebook
derived_variables = df %>% dplyr::select(dplyr::matches("percent$")) %>% colnames
undefined_variables = derived_variables[!(derived_variables %in% (codebook %>% dplyr::pull(calculated_variable)))]
results_phantom_definitions = codebook %>%
dplyr::filter(!(calculated_variable %in% (df %>% colnames))) %>%
nrow

testthat::expect_equal(results_phantom_definitions, 0) } )

## All variables in the input data are in the codebook
testthat::test_that(
"All variables in the input data are in the codebook.",
{ testthat::expect_equal(length(undefined_variables), 0) } )
{
## Statistics for CA and TX Tracts
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators"))
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators"))

derived_variables = df %>% dplyr::select(dplyr::matches("percent$")) %>% colnames
undefined_variables = derived_variables[!(derived_variables %in% (codebook %>% dplyr::pull(calculated_variable)))]

testthat::expect_equal(length(undefined_variables), 0) } )

0 comments on commit 50765bf

Please sign in to comment.