Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Separate v4 req vals checks by output type. Resolves #177 #178

Merged
merged 3 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# hubValidations (development version)

* Evaluation of whether all combinations of required values have been submitted through `check_tbl_values_required()` is now chunked by output type for v4 config and above. This reduces memory pressure and should speed up required value validation in hubs with complex task.json files. It also fixes the bug reported in #177.

# hubValidations 0.9.0

* Re-exported functions useful for modelers (#149):
Expand Down
69 changes: 45 additions & 24 deletions R/check_tbl_values_required.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,54 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
force_output_types <- TRUE
} else {
# For pre v4 configs, we use the legacy settings and rules.
output_types <- NULL
output_types <- list(NULL)
force_output_types <- FALSE
}

# Iterate over output types in v4. This reduces memory pressure and also
# keeps output type evaluation separate, resolving the bug reported in #177.
# v3 and below is unaffected and validation proceeds as before using the full
# modeling task expanded grid
missing_df <- purrr::map(output_types, \(.x) {
check_required_output_type_by_modeling_task(
tbl = tbl,
config_tasks = config_tasks,
round_id = round_id,
output_type = .x,
derived_task_ids = derived_task_ids,
force_output_types = force_output_types
)
}) %>%
purrr::list_rbind()

check <- nrow(missing_df) == 0L

if (check) {
details <- NULL
} else {
missing_df <- coerce_to_hub_schema(missing_df, config_tasks)
details <- cli::format_inline("See {.var missing} attribute for details.")
}

capture_check_cnd(
check = check,
file_path = file_path,
msg_subject = "Required task ID/output type/output type ID combinations",
msg_attribute = NULL,
msg_verbs = c("all present.", "missing."),
details = details,
missing = missing_df
)
}

check_required_output_type_by_modeling_task <- function(tbl, config_tasks,
round_id, output_type,
derived_task_ids,
force_output_types) {
req <- expand_model_out_grid(
config_tasks,
round_id = round_id,
output_types = output_types,
output_types = output_type,
required_vals_only = TRUE,
force_output_types = force_output_types,
all_character = TRUE,
Expand All @@ -56,7 +97,7 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
full <- expand_model_out_grid(
config_tasks,
round_id = round_id,
output_types = output_types,
output_types = output_type,
required_vals_only = FALSE,
all_character = TRUE,
as_arrow_table = FALSE,
Expand All @@ -66,30 +107,11 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,

tbl <- join_tbl_to_model_task(full, tbl, subset_to_tbl_cols = TRUE)

missing_df <- purrr::pmap(
purrr::pmap(
combine_mt_inputs(tbl, req, full),
check_modeling_task_values_required
) %>%
purrr::list_rbind()

check <- nrow(missing_df) == 0L

if (check) {
details <- NULL
} else {
missing_df <- coerce_to_hub_schema(missing_df, config_tasks)
details <- cli::format_inline("See {.var missing} attribute for details.")
}

capture_check_cnd(
check = check,
file_path = file_path,
msg_subject = "Required task ID/output type/output type ID combinations",
msg_attribute = NULL,
msg_verbs = c("all present.", "missing."),
details = details,
missing = missing_df
)
}

check_modeling_task_values_required <- function(tbl, req, full) {
Expand Down Expand Up @@ -173,7 +195,6 @@ get_opt_col_list <- function(x, mask, full, req) {
# Identify missing required values for optional value combinations.
# Output full missing rows compiled from optional values and missing required values.
missing_req_rows <- function(opt_cols, x, mask, req, full) {

if (all(opt_cols == FALSE)) {
return(req[!conc_rows(req) %in% conc_rows(x), ])
}
Expand Down
24 changes: 24 additions & 0 deletions tests/testthat/test-check_tbl_values_required.R
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,27 @@ test_that("Reading derived_task_ids from config works", {
check_tbl_values_required(tbl, round_id, file_path, hub_path)
)
})

test_that("v4 config output type leak fixed (#177)", {
hub_path <- test_path("testdata", "hub-177")
file_path <- "FluSight-baseline/2024-12-14-FluSight-baseline.parquet"
round_id <- "2024-12-14"
tbl <- read_model_out_file(
file_path = file_path,
hub_path = hub_path,
coerce_types = "chr"
)
res <- check_tbl_values_required(tbl,
round_id = round_id,
file_path = file_path, hub_path = hub_path
)

expect_s3_class(res,
c(
"check_success", "hub_check",
"rlang_message", "message",
"condition"
),
exact = TRUE
)
})
25 changes: 25 additions & 0 deletions tests/testthat/testdata/hub-177/hub-config/admin.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"schema_version": "https://raw.githubusercontent.com/hubverse-org/schemas/main/v4.0.0/admin-schema.json",
"name": "US CDC FluSight",
"maintainer": "US CDC",
"contact": {
"name": "Joe bloggs",
"email": "[email protected]"
},
"repository": {
"host": "github",
"owner": "cdcepi",
"repository": "FluSight-forecast-hub"
},
"file_format": ["csv", "parquet"],
"timezone": "US/Eastern",
"model_output_dir": "model-output",
"cloud": {
"enabled": true,
"host": {
"name": "aws",
"storage_service": "s3",
"storage_location": "cdcepi-flusight-forecast-hub"
}
}
}
Loading
Loading