-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #42 from UI-Research/new-indicators
minor modifications to tests
- Loading branch information
Showing
6 changed files
with
143 additions
and
120 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,85 +1,96 @@ | ||
####----Load Test Data----#### | ||
## Statistics for CA and TX Tracts | ||
df = compile_acs_data( | ||
variables = NULL, | ||
years = c(2022), | ||
geography = "tract", | ||
states = c("CA", "TX"), | ||
counties = NULL, | ||
retain_moes = TRUE, | ||
spatial = FALSE) | ||
message("Update test data prior to testing, as needed.") | ||
# df = compile_acs_data( | ||
# variables = NULL, | ||
# years = c(2022), | ||
# geography = "tract", | ||
# states = c("CA", "TX"), | ||
# counties = NULL, | ||
# retain_moes = TRUE, | ||
# spatial = FALSE) | ||
# | ||
# codebook = attr(df, "codebook") | ||
# saveRDS(object = df, file = file.path("inst", "test-data", "test_data_2024-08-24.rds")) | ||
# saveRDS(codebook, file = file.path("inst", "test-data", "codebook_2024-08-24.rds")) | ||
|
||
####----Tests----#### | ||
# All percentages have no values greater than one and no values less than zero | ||
percentage_outliers_maxima = df %>% | ||
dplyr::select(dplyr::matches("percent$")) %>% | ||
dplyr::summarise(dplyr::across(.cols = dplyr::where(is.numeric), ~ max(.x, na.rm = T))) %>% | ||
tidyr::pivot_longer(cols = everything()) %>% | ||
dplyr::arrange(desc(value)) %>% | ||
dplyr::filter(value > 1) %>% | ||
nrow | ||
|
||
percentage_outliers_minima = df %>% | ||
dplyr::select(dplyr::matches("percent$")) %>% | ||
dplyr::summarise(dplyr::across(.cols = where(is.numeric), ~ min(.x, na.rm = T))) %>% | ||
tidyr::pivot_longer(cols = everything()) %>% | ||
dplyr::arrange(desc(value)) %>% | ||
dplyr::filter(value < 0) %>% | ||
nrow | ||
|
||
testthat::test_that( | ||
"All percentages have no values greater than one and no values less than zero", | ||
{ testthat::expect_equal(percentage_outliers_maxima, 0) | ||
testthat::expect_equal(percentage_outliers_minima, 0) } ) | ||
{ | ||
## Statistics for CA and TX Tracts | ||
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
# All measures have meaningful values: maximum and mean greater than zero, | ||
# all values are not missing, and there are at least two distinct values per measure | ||
percentage_outliers_maxima = df %>% | ||
dplyr::select(dplyr::matches("percent$")) %>% | ||
dplyr::summarise(dplyr::across(.cols = dplyr::where(is.numeric), ~ max(.x, na.rm = T))) %>% | ||
tidyr::pivot_longer(cols = everything()) %>% | ||
dplyr::arrange(desc(value)) %>% | ||
dplyr::filter(value > 1) %>% | ||
nrow | ||
|
||
summary_statistics = df %>% | ||
dplyr::select(GEOID, matches("percent$")) %>% | ||
tidyr::pivot_longer(-GEOID) %>% | ||
dplyr::group_by(name) %>% | ||
dplyr::summarise( | ||
count_total = dplyr::n(), | ||
count_na = sum(is.na(value)), | ||
min = min(value, na.rm = T), | ||
max = max(value, na.rm = T), | ||
mean = mean(value, na.rm = T), | ||
distinct_values = dplyr::n_distinct(value, na.rm = T)) | ||
percentage_outliers_minima = df %>% | ||
dplyr::select(dplyr::matches("percent$")) %>% | ||
dplyr::summarise(dplyr::across(.cols = where(is.numeric), ~ min(.x, na.rm = T))) %>% | ||
tidyr::pivot_longer(cols = everything()) %>% | ||
dplyr::arrange(desc(value)) %>% | ||
dplyr::filter(value < 0) %>% | ||
nrow | ||
|
||
summary_statistics %>% | ||
dplyr::filter(stringr::str_detect(name, "area")) | ||
testthat::expect_equal(percentage_outliers_maxima, 0) | ||
testthat::expect_equal(percentage_outliers_minima, 0) } ) | ||
|
||
# All measures have meaningful values: maximum and mean greater than zero, | ||
# all values are not missing, and there are at least two distinct values per measure | ||
testthat::test_that( | ||
"All measures have meaningful values", | ||
{ purrr::map_dbl(summary_statistics$max, ~testthat::expect_gt(.x, 0) ) | ||
{ | ||
## Statistics for CA and TX Tracts | ||
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
summary_statistics = df %>% | ||
dplyr::select(GEOID, matches("percent$")) %>% | ||
tidyr::pivot_longer(-GEOID) %>% | ||
dplyr::group_by(name) %>% | ||
dplyr::summarise( | ||
count_total = dplyr::n(), | ||
count_na = sum(is.na(value)), | ||
min = min(value, na.rm = T), | ||
max = max(value, na.rm = T), | ||
mean = mean(value, na.rm = T), | ||
distinct_values = dplyr::n_distinct(value, na.rm = T)) | ||
|
||
purrr::map_dbl(summary_statistics$max, ~testthat::expect_gt(.x, 0) ) | ||
purrr::map_dbl(summary_statistics$mean, ~testthat::expect_gt(.x, 0) ) | ||
purrr::map2_dbl(summary_statistics$count_na, summary_statistics$count_total, ~ testthat::expect_lt(.x, .y) ) | ||
purrr::map_dbl(summary_statistics$min, ~testthat::expect_gte(.x, 0) ) }) | ||
|
||
# No percentage measure has exactly the same values as that of another percentage measure | ||
# (Note: this is implemented with tract data across the entire US. This does not | ||
# No percentage measure has exactly the same values (for every observation) as | ||
# that of another percentage measure (Note: this is implemented with tract data. This does not | ||
# inherently apply to other levels of geographic observation, nor for subsets of the US.) | ||
duplicates = purrr::map_dfr( | ||
colnames(df %>% dplyr::select(dplyr::matches("percent$"))), | ||
function(colname1) { | ||
purrr::map_dfr( | ||
colnames(df %>% dplyr::select(-dplyr::matches("_M$|universe"))), | ||
function(colname2) { | ||
if ((colname1 != colname2) & (identical(df[[colname1]], df[[colname2]]))) { | ||
data.frame(variable_one = colname1, variable_two = colname2)}})}) | ||
|
||
distinct_duplicates = duplicates %>% | ||
dplyr::mutate( | ||
combined_variables = dplyr::if_else( | ||
variable_one > variable_two, | ||
paste(variable_one, variable_two), | ||
paste(variable_two, variable_one))) %>% | ||
dplyr::distinct(combined_variables, .keep_all = T) %>% | ||
# this variable intentionally has a single duplicate (differently-named) variable | ||
# for consistency across different concepts | ||
dplyr::filter(!stringr::str_detect(variable_one, "year_structure_built.*later")) | ||
|
||
testthat::test_that( | ||
"All percentages are distinct", | ||
{ testthat::expect_equal(nrow(distinct_duplicates), 0) } ) | ||
{ | ||
## Statistics for CA and TX Tracts | ||
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
duplicates = purrr::map_dfr( | ||
colnames(df %>% dplyr::select(dplyr::matches("percent$"))), | ||
function(colname1) { | ||
purrr::map_dfr( | ||
colnames(df %>% dplyr::select(-dplyr::matches("_M$|universe"))), | ||
function(colname2) { | ||
if ((colname1 != colname2) & (identical(df[[colname1]], df[[colname2]]))) { | ||
data.frame(variable_one = colname1, variable_two = colname2)}})}) | ||
|
||
distinct_duplicates = duplicates %>% | ||
dplyr::mutate( | ||
combined_variables = dplyr::if_else( | ||
variable_one > variable_two, | ||
paste(variable_one, variable_two), | ||
paste(variable_two, variable_one))) %>% | ||
dplyr::distinct(combined_variables, .keep_all = T) %>% | ||
# this variable intentionally has a single duplicate (differently-named) variable | ||
# for consistency across different concepts | ||
dplyr::filter(!stringr::str_detect(variable_one, "year_structure_built.*later")) | ||
|
||
testthat::expect_equal(nrow(distinct_duplicates), 0) } ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,77 +1,89 @@ | ||
####----Load Test Data----#### | ||
|
||
## Statistics for NJ Counties | ||
df = compile_acs_data( | ||
variables = list_acs_variables(year = "2022"), | ||
years = 2022, | ||
geography = "county", | ||
states = "NJ", | ||
counties = NULL, | ||
retain_moes = TRUE, | ||
spatial = TRUE) | ||
|
||
codebook = attr(df, "codebook") | ||
|
||
#####----TESTING----##### | ||
## Testing data is created in test-compile_acs_data.R | ||
|
||
####----Tests----#### | ||
## No missingness in codebook | ||
results_missingness = codebook %>% | ||
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ is.na(.x))) %>% | ||
nrow | ||
|
||
testthat::test_that( | ||
"No column in the codebook has a missing value.", | ||
{ testthat::expect_equal(results_missingness, 0) } ) | ||
{ | ||
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
## No transcribed function calls | ||
results_transcribed_functions = codebook %>% | ||
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "dplyr"))) %>% | ||
nrow | ||
results_missingness = codebook %>% | ||
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ is.na(.x))) %>% | ||
nrow | ||
|
||
testthat::expect_equal(results_missingness, 0) } ) | ||
|
||
## No transcribed function calls | ||
testthat::test_that( | ||
"No transcribed functions included in codebook output.", | ||
{ testthat::expect_equal(results_transcribed_functions, 0) } ) | ||
{ | ||
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
## No missing raw variable codes | ||
results_missing_raw_variables = codebook %>% | ||
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "\\(\\)|\\(NA\\)"))) %>% | ||
nrow | ||
results_transcribed_functions = codebook %>% | ||
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "dplyr"))) %>% | ||
nrow | ||
|
||
testthat::expect_equal(results_transcribed_functions, 0) } ) | ||
|
||
## No missing raw variable codes | ||
testthat::test_that( | ||
"No variable definitions contain '(NA)' in lieu of the raw variable code.", | ||
{ testthat::expect_equal(results_missing_raw_variables, 0) } ) | ||
{ | ||
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
## No universe variables in numerators (except population density) | ||
results_universe_numerators = codebook %>% | ||
dplyr::filter(stringr::str_detect(definition, "Numerator.*universe.*Denominator")) %>% | ||
nrow | ||
results_missing_raw_variables = codebook %>% | ||
dplyr::filter(dplyr::if_any(.cols = dplyr::everything(), ~ stringr::str_detect(.x, "\\(\\)|\\(NA\\)"))) %>% | ||
nrow | ||
|
||
testthat::expect_equal(results_missing_raw_variables, 0) } ) | ||
|
||
## No universe variables in numerators (except population density) | ||
testthat::test_that( | ||
"Only population density contains a universe variable in the numerator.", | ||
{ testthat::expect_equal(results_universe_numerators, 1) } ) | ||
{ | ||
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
## No definitions for variables that are percentages of universes (not possible) | ||
results_universe_percentages = codebook %>% | ||
dplyr::filter(stringr::str_detect(calculated_variable, "universe.*percent$")) %>% | ||
nrow | ||
results_universe_numerators = codebook %>% | ||
dplyr::filter(stringr::str_detect(definition, "Numerator.*universe.*Denominator")) %>% | ||
nrow | ||
|
||
testthat::expect_equal(results_universe_numerators, 1) } ) | ||
|
||
## No definitions for variables that are percentages of universes (not possible) | ||
testthat::test_that( | ||
"No calculated variables are perentages of a universe estimate.", | ||
{ testthat::expect_equal(results_universe_percentages, 0) } ) | ||
{ | ||
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
## No codebook variable definitions that are missing from the input dataset | ||
results_phantom_definitions = codebook %>% | ||
dplyr::filter(!(calculated_variable %in% (df %>% colnames))) %>% | ||
nrow | ||
results_universe_percentages = codebook %>% | ||
dplyr::filter(stringr::str_detect(calculated_variable, "universe.*percent$")) %>% | ||
nrow | ||
|
||
testthat::expect_equal(results_universe_percentages, 0) } ) | ||
|
||
## No codebook variable definitions that are missing from the input dataset | ||
testthat::test_that( | ||
"No codebook entries for variables that don't exist in the input data.", | ||
{ testthat::expect_equal(results_phantom_definitions, 0) } ) | ||
{ | ||
## Statistics for CA and TX Tracts | ||
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) | ||
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
## All variables in the input data are in the codebook | ||
derived_variables = df %>% dplyr::select(dplyr::matches("percent$")) %>% colnames | ||
undefined_variables = derived_variables[!(derived_variables %in% (codebook %>% dplyr::pull(calculated_variable)))] | ||
results_phantom_definitions = codebook %>% | ||
dplyr::filter(!(calculated_variable %in% (df %>% colnames))) %>% | ||
nrow | ||
|
||
testthat::expect_equal(results_phantom_definitions, 0) } ) | ||
|
||
## All variables in the input data are in the codebook | ||
testthat::test_that( | ||
"All variables in the input data are in the codebook.", | ||
{ testthat::expect_equal(length(undefined_variables), 0) } ) | ||
{ | ||
## Statistics for CA and TX Tracts | ||
df = readRDS(system.file("test-data", "test_data_2024-08-24.rds", package = "urbnindicators")) | ||
codebook = readRDS(system.file("test-data", "codebook_2024-08-24.rds", package = "urbnindicators")) | ||
|
||
derived_variables = df %>% dplyr::select(dplyr::matches("percent$")) %>% colnames | ||
undefined_variables = derived_variables[!(derived_variables %in% (codebook %>% dplyr::pull(calculated_variable)))] | ||
|
||
testthat::expect_equal(length(undefined_variables), 0) } ) |