Skip to content

Commit

Permalink
Merge branch 'main' into ak/derived-task-id-val-check/110
Browse files Browse the repository at this point in the history
  • Loading branch information
zkamvar authored Dec 12, 2024
2 parents 0a12910 + 1fdf702 commit 089b5e8
Show file tree
Hide file tree
Showing 9 changed files with 843 additions and 32 deletions.
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# hubValidations (development version)

* Added `check_tbl_derived_task_id_vals()` check to `validate_model_data()` that ensures that values in derived task ID columns match expected values for the corresponding derived task IDs in the round as defined in `tasks.json` config (#110). Given the dependence of derived task IDs on the values of other values, the check ignores the combinations of derived task ID values with those of other task IDs and focuses only on identifying values that do not match corresponding accepted values.
* Added `check_tbl_derived_task_id_vals()` check to `validate_model_data()` that ensures that values in derived task ID columns match expected values for the corresponding derived task IDs in the round as defined in `tasks.json` config (#110). Given the dependence of derived task IDs on the values of their source task ID values, the check ignores the combinations of derived task ID values with those of other task IDs and focuses only on identifying values that do not match corresponding accepted values.
* `submission_tmpl()` gains the `force_output_types` argument, allowing users to force optional output types to be included in a submission template when `required_vals_only = TRUE`. In conjunction with the use of the `output_types` argument, this allows users to create submission templates which include optional output types they plan to submit.
* `check_tbl_values_required()` no longer reports false positives for v4 hubs, which fixes the bug reported in #177. Evaluation of whether all combinations of required values have been submitted through `check_tbl_values_required()` is now chunked by output type for v4 config and above. This reduces memory pressure and should speed up required value validation in hubs with complex `tasks.json` files.

# hubValidations 0.9.0

Expand Down
69 changes: 45 additions & 24 deletions R/check_tbl_values_required.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,54 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
force_output_types <- TRUE
} else {
# For pre v4 configs, we use the legacy settings and rules.
output_types <- NULL
output_types <- list(NULL)
force_output_types <- FALSE
}

# Iterate over output types in v4. This reduces memory pressure and also
# keeps output type evaluation separate, resolving the bug reported in #177.
# v3 and below is unaffected and validation proceeds as before using the full
# modeling task expanded grid
missing_df <- purrr::map(output_types, \(.x) {
check_required_output_type_by_modeling_task(
tbl = tbl,
config_tasks = config_tasks,
round_id = round_id,
output_type = .x,
derived_task_ids = derived_task_ids,
force_output_types = force_output_types
)
}) %>%
purrr::list_rbind()

check <- nrow(missing_df) == 0L

if (check) {
details <- NULL
} else {
missing_df <- coerce_to_hub_schema(missing_df, config_tasks)
details <- cli::format_inline("See {.var missing} attribute for details.")
}

capture_check_cnd(
check = check,
file_path = file_path,
msg_subject = "Required task ID/output type/output type ID combinations",
msg_attribute = NULL,
msg_verbs = c("all present.", "missing."),
details = details,
missing = missing_df
)
}

check_required_output_type_by_modeling_task <- function(tbl, config_tasks,
round_id, output_type,
derived_task_ids,
force_output_types) {
req <- expand_model_out_grid(
config_tasks,
round_id = round_id,
output_types = output_types,
output_types = output_type,
required_vals_only = TRUE,
force_output_types = force_output_types,
all_character = TRUE,
Expand All @@ -56,7 +97,7 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
full <- expand_model_out_grid(
config_tasks,
round_id = round_id,
output_types = output_types,
output_types = output_type,
required_vals_only = FALSE,
all_character = TRUE,
as_arrow_table = FALSE,
Expand All @@ -66,30 +107,11 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,

tbl <- join_tbl_to_model_task(full, tbl, subset_to_tbl_cols = TRUE)

missing_df <- purrr::pmap(
purrr::pmap(
combine_mt_inputs(tbl, req, full),
check_modeling_task_values_required
) %>%
purrr::list_rbind()

check <- nrow(missing_df) == 0L

if (check) {
details <- NULL
} else {
missing_df <- coerce_to_hub_schema(missing_df, config_tasks)
details <- cli::format_inline("See {.var missing} attribute for details.")
}

capture_check_cnd(
check = check,
file_path = file_path,
msg_subject = "Required task ID/output type/output type ID combinations",
msg_attribute = NULL,
msg_verbs = c("all present.", "missing."),
details = details,
missing = missing_df
)
}

check_modeling_task_values_required <- function(tbl, req, full) {
Expand Down Expand Up @@ -173,7 +195,6 @@ get_opt_col_list <- function(x, mask, full, req) {
# Identify missing required values for optional value combinations.
# Output full missing rows compiled from optional values and missing required values.
missing_req_rows <- function(opt_cols, x, mask, req, full) {

if (all(opt_cols == FALSE)) {
return(req[!conc_rows(req) %in% conc_rows(x), ])
}
Expand Down
54 changes: 49 additions & 5 deletions R/submission_tmpl.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,25 @@
#' values only.
#'
#' @details
#' For task IDs or output_type_ids where all values are optional, by default, columns
#' are included as columns of `NA`s when `required_vals_only = TRUE`.
#' For task IDs where all values are optional, by default, columns
#' are created as columns of `NA`s when `required_vals_only = TRUE`.
#' When such columns exist, the function returns a tibble with zero rows, as no
#' complete cases of required value combinations exist.
#' _(Note that determination of complete cases excludes valid `NA`
#' `output_type_id` values in `"mean"` and `"median"` output types)._
#' To return a template of incomplete required cases, which includes `NA` columns, use
#' `complete_cases_only = FALSE`.
#'
#' To include output types that are optional in the submission template
#' when `required_vals_only = TRUE` and `complete_cases_only = FALSE`, use
#' `force_output_types = TRUE`. Use this in combination with sub-setting for
#' output types you plan to submit via argument `output_types` to create a
#' submission template customised to your submission plans.
#' _Tip: to ensure you create a template with all required output types, it's
#' a good idea to first run the function without subsetting or forcing output
#' types and examine the unique values in `output_type` to check which output
#' types are required._
#'
#' When sample output types are included in the output, the `output_type_id`
#' column contains example sample indexes which are useful for identifying the
#' compound task ID structure of multivariate sampling distributions in particular,
Expand Down Expand Up @@ -116,8 +126,19 @@
#' derived_task_ids = "target_end_date",
#' complete_cases_only = FALSE
#' )
#' # Force optional output type, in this case "mean".
#' submission_tmpl(
#' config_tasks = config_tasks,
#' round_id = "2022-12-12",
#' required_vals_only = TRUE,
#' output_types = c("pmf", "quantile", "mean"),
#' force_output_types = TRUE,
#' derived_task_ids = "target_end_date",
#' complete_cases_only = FALSE
#' )
submission_tmpl <- function(hub_con, config_tasks, round_id,
required_vals_only = FALSE,
force_output_types = FALSE,
complete_cases_only = TRUE,
compound_taskid_set = NULL,
output_types = NULL,
Expand All @@ -138,15 +159,39 @@ submission_tmpl <- function(hub_con, config_tasks, round_id,
derived_task_ids, config_tasks, round_id
)
}

tmpl_df <- expand_model_out_grid(config_tasks,
round_id = round_id,
required_vals_only = required_vals_only,
include_sample_ids = TRUE,
compound_taskid_set = compound_taskid_set,
output_types = output_types,
derived_task_ids = derived_task_ids
derived_task_ids = derived_task_ids,
force_output_types = force_output_types
)
if (nrow(tmpl_df) == 0L && !complete_cases_only) {
# If all output_types are optional, expand_model_out_grid returns
# a zero row and column data.frame. To attempt to expand required task id
# values when complete_cases_only = FALSE, we use
# force_output_types = TRUE to force the output types to be included. We
# then remove output type related columns and create a data.frame of
# required task id values only.
tmpl_df <- expand_model_out_grid(
config_tasks,
round_id = round_id,
required_vals_only = required_vals_only,
include_sample_ids = TRUE,
compound_taskid_set = compound_taskid_set,
output_types = output_types,
derived_task_ids = derived_task_ids,
force_output_types = TRUE
)
output_cols <- hubUtils::std_colnames[c("output_type", "output_type_id", "value")]
tmpl_df <- tmpl_df[setdiff(names(tmpl_df), output_cols)] |>
unique()
}
if (nrow(tmpl_df) == 0L) {
return(tmpl_df)
}

tmpl_cols <- c(
hubUtils::get_round_task_id_names(
Expand All @@ -155,7 +200,6 @@ submission_tmpl <- function(hub_con, config_tasks, round_id,
),
hubUtils::std_colnames[names(hubUtils::std_colnames) != "model_id"]
)

# Add NA columns for value and all optional cols
na_cols <- tmpl_cols[!tmpl_cols %in% names(tmpl_df)]
tmpl_df[, na_cols] <- NA
Expand Down
30 changes: 28 additions & 2 deletions man/submission_tmpl.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions tests/testthat/test-check_tbl_values_required.R
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,27 @@ test_that("Reading derived_task_ids from config works", {
check_tbl_values_required(tbl, round_id, file_path, hub_path)
)
})

test_that("v4 config output type leak fixed (#177)", {
  # Regression test for #177: run the required-values check against the
  # hub-177 fixture submission and expect a plain success condition.
  round_id <- "2024-12-14"
  hub_path <- test_path("testdata", "hub-177")
  file_path <- "FluSight-baseline/2024-12-14-FluSight-baseline.parquet"

  tbl <- read_model_out_file(
    file_path = file_path,
    hub_path = hub_path,
    coerce_types = "chr"
  )
  check_result <- check_tbl_values_required(
    tbl,
    round_id = round_id,
    file_path = file_path,
    hub_path = hub_path
  )

  # The returned condition must carry exactly this class vector.
  success_classes <- c(
    "check_success", "hub_check",
    "rlang_message", "message",
    "condition"
  )
  expect_s3_class(check_result, success_classes, exact = TRUE)
})
49 changes: 49 additions & 0 deletions tests/testthat/test-submission_tmpl.R
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,52 @@ test_that("submission_tmpl ignoring derived task ids works", {
)
)
})


test_that("submission_tmpl force_output_types works", {
  config_path <- test_path("testdata", "configs", "tasks-samples-v4.json")
  config_tasks <- read_config_file(config_path)

  # Case 1: force_output_types left at its default. Every output type in this
  # round is optional, so the template comes back with no rows and no columns.
  tmpl_default <- submission_tmpl(
    config_tasks = config_tasks,
    round_id = "2022-10-22",
    required_vals_only = TRUE,
    output_types = "sample"
  )
  expect_equal(dim(tmpl_default), c(0L, 0L))

  # Case 2: force_output_types still at its default, but incomplete cases are
  # allowed (complete_cases_only = FALSE). The template then holds the
  # required task ID values, with optional task IDs and the output type
  # related columns filled with NA.
  suppressMessages(
    expect_warning(
      {
        tmpl_unforced <- submission_tmpl(
          config_tasks = config_tasks,
          round_id = "2022-10-22",
          required_vals_only = TRUE,
          output_types = "sample",
          complete_cases_only = FALSE
        )
      },
      "all optional values"
    )
  )
  expect_equal(dim(tmpl_unforced), c(4L, 9L))
  expect_equal(unique(tmpl_unforced$output_type), NA_character_)

  # Case 3: force_output_types = TRUE. The requested (optional) output type
  # must now appear in the template.
  suppressMessages(
    expect_warning(
      {
        tmpl_forced <- submission_tmpl(
          config_tasks = config_tasks,
          round_id = "2022-10-22",
          required_vals_only = TRUE,
          force_output_types = TRUE,
          output_types = "sample",
          complete_cases_only = FALSE
        )
      },
      "all optional values"
    )
  )
  expect_equal(dim(tmpl_forced), c(4L, 9L))
  expect_equal(unique(tmpl_forced$output_type), "sample")
})
25 changes: 25 additions & 0 deletions tests/testthat/testdata/hub-177/hub-config/admin.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"schema_version": "https://raw.githubusercontent.com/hubverse-org/schemas/main/v4.0.0/admin-schema.json",
"name": "US CDC FluSight",
"maintainer": "US CDC",
"contact": {
"name": "Joe bloggs",
"email": "[email protected]"
},
"repository": {
"host": "github",
"owner": "cdcepi",
"repository": "FluSight-forecast-hub"
},
"file_format": ["csv", "parquet"],
"timezone": "US/Eastern",
"model_output_dir": "model-output",
"cloud": {
"enabled": true,
"host": {
"name": "aws",
"storage_service": "s3",
"storage_location": "cdcepi-flusight-forecast-hub"
}
}
}
Loading

0 comments on commit 089b5e8

Please sign in to comment.