diff --git a/DESCRIPTION b/DESCRIPTION index 3d327a12..2ee2c5c7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: hubValidations Title: Testing framework for hubverse hub validations -Version: 0.4.0 +Version: 0.5.0 Authors@R: c( person( given = "Anna", diff --git a/NAMESPACE b/NAMESPACE index a63d9eed..de940c92 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,7 @@ export(is_exec_warn) export(is_failure) export(is_info) export(is_success) +export(match_tbl_to_model_task) export(new_hub_validations) export(not_pass) export(opt_check_metadata_team_max_model_n) diff --git a/NEWS.md b/NEWS.md index 5449c894..12cae75c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,18 @@ +# hubValidations 0.5.0 + +This release introduces **significant improvements in the performance of submission validation** via the following changes: + +* Add ability to sub-set expanded valid value grids by output type through `output_type` argument in `expand_model_out_grid()` (#98). +* Add ability to ignore the values of derived task IDs in expanded valid value grids through argument `derived_task_ids` in `expand_model_out_grid()`. +* Use sub-setting and batching of model output data validation by output type in appropriate lower level checks and add ability to ignore derived task IDs in `validate_model_data()`, `validate_submission()` and `validate_pr()`. + +Both of these changes **allow for the creation of smaller, more focused expanded valid value grids, significantly reducing pressure on memory** when working with large, complex hub configs and making submission validation much more efficient. + +Additional useful functionality: + +* Add ability to subset by output type and ignore derived task IDs to `submission_tmpl()`. Ignoring derived task ids can be particularly useful to avoid creating templates with invalid derived task ID value combinations. +* Add new exported function `match_tbl_to_model_task()` that matches the rows in a `tbl` of model output data to a model task of a given round (as defined in `tasks.json`). + # hubValidations 0.4.0 - Add new `check_tbl_spl_compound_taskid_set()` check function to `validate_model_data()` that ensures that sample compound task id sets for each modeling task match or are coarser than the expected set defined in `tasks.json` config. diff --git a/R/check_tbl_spl_compound_taskid_set.R b/R/check_tbl_spl_compound_taskid_set.R index 042dc561..6107d0d8 100644 --- a/R/check_tbl_spl_compound_taskid_set.R +++ b/R/check_tbl_spl_compound_taskid_set.R @@ -8,6 +8,7 @@ #' Column types must **all be character**. #' @inherit check_tbl_colnames params #' @inherit check_tbl_colnames return +#' @inheritParams expand_model_out_grid #' @details If the check fails, the output of the check includes an `errors` element, #' a list of items, one for each modeling task failing validation. #' The structure depends on the reason the check failed. @@ -31,15 +32,18 @@ #' See [hubverse documentation on samples](https://hubverse.io/en/latest/user-guide/sample-output-type.html) #' for more details. #' @export -check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path) { +check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { config_tasks <- hubUtils::read_config(hub_path, "tasks") if (isFALSE(has_spls_tbl(tbl)) || isFALSE(hubUtils::is_v3_config(config_tasks))) { return(skip_v3_spl_check(file_path)) } - compound_taskid_set <- get_tbl_compound_taskid_set(tbl, config_tasks, round_id, - compact = FALSE, error = FALSE + compound_taskid_set <- get_tbl_compound_taskid_set( + tbl, config_tasks, round_id, + compact = FALSE, error = FALSE, + derived_task_ids = NULL ) check <- purrr::map_lgl( @@ -47,8 +51,6 @@ check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path ~ is.null(attr(.x, "errors")) ) |> all() - - capture_check_cnd( check = check, file_path = file_path, diff --git a/R/check_tbl_spl_compound_tid.R b/R/check_tbl_spl_compound_tid.R index 54fe9246..71edfaa2 100644 --- a/R/check_tbl_spl_compound_tid.R +++ b/R/check_tbl_spl_compound_tid.R @@ -4,6 +4,7 @@ #' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**. #' @inherit check_tbl_colnames params #' @inherit check_tbl_colnames return +#' @inheritParams expand_model_out_grid #' @param compound_taskid_set a list of `compound_taskid_set`s (characters vector of compound task IDs), #' one for each modeling task. Used to override the compound task ID set in the config file, #' for example, when validating coarser samples. @@ -17,7 +18,8 @@ #' for more details. #' @export check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path, - compound_taskid_set = NULL) { + compound_taskid_set = NULL, + derived_task_ids = NULL) { if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) { cli::cli_abort("Valid {.var compound_taskid_set} must be provided.") } @@ -33,7 +35,9 @@ check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path, return(skip_v3_spl_check(file_path)) } - hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set) + hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set, + derived_task_ids = derived_task_ids + ) n_tbl <- hash_tbl[hash_tbl$n_compound_idx > 1L, ] check <- nrow(n_tbl) == 0L diff --git a/R/check_tbl_spl_n.R b/R/check_tbl_spl_n.R index 1a6e5b4b..ed967de5 100644 --- a/R/check_tbl_spl_n.R +++ b/R/check_tbl_spl_n.R @@ -16,7 +16,8 @@ #' for more details. #' @export check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path, - compound_taskid_set = NULL) { + compound_taskid_set = NULL, + derived_task_ids = NULL) { if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) { cli::cli_abort("Valid {.var compound_taskid_set} must be provided.") } @@ -32,7 +33,9 @@ check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path, return(skip_v3_spl_check(file_path)) } - hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set) + hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set, + derived_task_ids = derived_task_ids + ) n_ranges <- get_round_spl_n_ranges(config_tasks, round_id) n_tbl <- dplyr::group_by(hash_tbl, .data$compound_idx) %>% diff --git a/R/check_tbl_spl_non_compound_tid.R b/R/check_tbl_spl_non_compound_tid.R index fd1eb0ee..41975fdb 100644 --- a/R/check_tbl_spl_non_compound_tid.R +++ b/R/check_tbl_spl_non_compound_tid.R @@ -17,7 +17,8 @@ #' for more details. #' @export check_tbl_spl_non_compound_tid <- function(tbl, round_id, file_path, hub_path, - compound_taskid_set = NULL) { + compound_taskid_set = NULL, + derived_task_ids = NULL) { if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) { cli::cli_abort("Valid {.var compound_taskid_set} must be provided.") } @@ -33,7 +34,8 @@ check_tbl_spl_non_compound_tid <- function(tbl, round_id, file_path, hub_path, return(skip_v3_spl_check(file_path)) } - hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set) + hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set, + derived_task_ids = derived_task_ids) n_tbl <- dplyr::summarise( hash_tbl, diff --git a/R/check_tbl_value_col.R b/R/check_tbl_value_col.R index 189b7f02..85ce5af3 100644 --- a/R/check_tbl_value_col.R +++ b/R/check_tbl_value_col.R @@ -5,54 +5,34 @@ #' type of the appropriate model task. #' @inherit check_tbl_colnames params #' @inherit check_tbl_col_types return +#' @inheritParams expand_model_out_grid #' @export -check_tbl_value_col <- function(tbl, round_id, file_path, hub_path) { +check_tbl_value_col <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { config_tasks <- hubUtils::read_config(hub_path, "tasks") tbl[, names(tbl) != "value"] <- hubData::coerce_to_character( tbl[, names(tbl) != "value"] ) + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } - full <- expand_model_out_grid( - config_tasks, - round_id = round_id, - required_vals_only = FALSE, - all_character = TRUE, - as_arrow_table = FALSE, - bind_model_tasks = FALSE - ) - - join_cols <- names(tbl)[names(tbl) != "value"] # nolint: object_usage_linter - tbl <- purrr::map( - full, - ~ dplyr::inner_join(.x, tbl, by = join_cols) - ) - - round_config <- get_file_round_config(file_path, hub_path) - output_type_config <- round_config[["model_tasks"]] %>% - purrr::map(~ .x[["output_type"]]) - - - details <- purrr::map2( - tbl, output_type_config, - check_modeling_task_value_col - ) %>% + details <- split(tbl, f = tbl$output_type) %>% + purrr::imap( + \(.x, .y) { + check_value_col_by_output_type( + tbl = .x, output_type = .y, + config_tasks = config_tasks, + round_id = round_id, + derived_task_ids = derived_task_ids + ) + } + ) %>% unlist(use.names = TRUE) check <- is.null(details) - ## Example code for attempting bullets of details. Needs more experimentation - ## but parking for now. - # if (!check) { - # details_bullets_div <- function(details) { - # cli::cli_div() - # cli::format_bullets_raw( - # stats::setNames(details, rep("*", length(details))) - # ) - # } - # details <- details_bullets_div(details) - # } - capture_check_cnd( check = check, file_path = file_path, @@ -63,19 +43,28 @@ check_tbl_value_col <- function(tbl, round_id, file_path, hub_path) { ) } - -check_modeling_task_value_col <- function(tbl, output_type_config) { - purrr::imap( - split(tbl, tbl[["output_type"]]), - ~ compare_values_to_config( - tbl = .x, output_type = .y, - output_type_config - ) +check_value_col_by_output_type <- function(tbl, output_type, + config_tasks, round_id, + derived_task_ids = NULL) { + purrr::map2( + .x = match_tbl_to_model_task(tbl, config_tasks, + round_id, output_type, + derived_task_ids = derived_task_ids + ), + .y = get_round_output_types(config_tasks, round_id), + \(.x, .y) { + compare_values_to_config( + tbl = .x, output_type_config = .y, output_type = output_type + ) + } ) %>% unlist(use.names = TRUE) } compare_values_to_config <- function(tbl, output_type, output_type_config) { + if (any(is.null(tbl), is.null(output_type_config))) { + return(NULL) + } details <- NULL values <- tbl$value config <- output_type_config[[output_type]][["value"]] diff --git a/R/check_tbl_values.R b/R/check_tbl_values.R index 89a75de2..420b7e5d 100644 --- a/R/check_tbl_values.R +++ b/R/check_tbl_values.R @@ -1,34 +1,25 @@ #' Check model output data tbl contains valid value combinations #' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**. #' @inherit check_tbl_colnames params +#' @inheritParams expand_model_out_grid #' @inherit check_tbl_colnames return #' @export -check_tbl_values <- function(tbl, round_id, file_path, hub_path) { +check_tbl_values <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { config_tasks <- hubUtils::read_config(hub_path, "tasks") - # Coerce accepted vals to character for easier comparison of - # values. Tried to use arrow tbls for comparisons as more efficient when - # working with larger files but currently arrow does not match NAs as dplyr - # does, returning false positives for mean & median rows which contain NA in - # output type ID column. - accepted_vals <- expand_model_out_grid( - config_tasks = config_tasks, - round_id = round_id, - all_character = TRUE - ) - - # This approach uses dplyr to identify tbl rows that don't have a complete match - # in accepted_vals. - accepted_vals$valid <- TRUE - if (hubUtils::is_v3_config(config_tasks)) { - out_type_ids <- tbl[["output_type_id"]] - tbl[tbl$output_type == "sample", "output_type_id"] <- NA - } - - valid_tbl <- dplyr::left_join( - tbl, accepted_vals, - by = names(tbl)[names(tbl) != "value"] - ) + valid_tbl <- tbl %>% + tibble::rowid_to_column() %>% + split(f = tbl$output_type) %>% + purrr::imap( + ~ check_values_by_output_type( + tbl = .x, output_type = .y, + config_tasks = config_tasks, + round_id = round_id, + derived_task_ids = derived_task_ids + ) + ) %>% + purrr::list_rbind() check <- !any(is.na(valid_tbl$valid)) @@ -36,14 +27,14 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) { details <- NULL error_tbl <- NULL } else { - error_summary <- summarise_invalid_values(valid_tbl, accepted_vals) + error_summary <- summarise_invalid_values( + valid_tbl, config_tasks, round_id, + derived_task_ids + ) details <- error_summary$msg if (length(error_summary$invalid_combs_idx) == 0L) { error_tbl <- NULL } else { - if (hubUtils::is_v3_config(config_tasks)) { - tbl[["output_type_id"]] <- out_type_ids - } error_tbl <- tbl[ error_summary$invalid_combs_idx, names(tbl) != "value" @@ -66,10 +57,54 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) { ) } -summarise_invalid_values <- function(valid_tbl, accepted_vals) { - cols <- names(valid_tbl)[!names(valid_tbl) %in% c("value", "valid")] +check_values_by_output_type <- function(tbl, output_type, config_tasks, round_id, + derived_task_ids = NULL) { + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } + + # Coerce accepted vals to character for easier comparison of + # values. Tried to use arrow tbls for comparisons as more efficient when + # working with larger files but currently arrow does not match NAs as dplyr + # does, returning false positives for mean & median rows which contain NA in + # output type ID column. + accepted_vals <- expand_model_out_grid( + config_tasks = config_tasks, + round_id = round_id, + all_character = TRUE, + output_types = output_type, + derived_task_ids = derived_task_ids + ) + + # This approach uses dplyr to identify tbl rows that don't have a complete match + # in accepted_vals. + accepted_vals$valid <- TRUE + if (hubUtils::is_v3_config(config_tasks) && output_type == "sample") { + tbl[tbl$output_type == "sample", "output_type_id"] <- NA + } + + dplyr::left_join( + tbl, accepted_vals, + by = setdiff(names(tbl), c("value", "rowid")) + ) +} + +# Summarise results of check for invalid values by creating appropriate +# messages and extracting the rowids of invalid value combinations with respect +# to the row order in the original tbl. +# Problems are summarised in two parts: +# First we report any invalid values in the tbl that do not match any values in the +# config. Second we report any rows that contain valid values but in invalid +# combinations. +summarise_invalid_values <- function(valid_tbl, config_tasks, round_id, + derived_task_ids) { + # Chack for invalid values + cols <- setdiff(names(valid_tbl), c("value", "valid", "rowid")) uniq_tbl <- purrr::map(valid_tbl[cols], unique) - uniq_config <- purrr::map(accepted_vals[cols], unique) + uniq_config <- get_round_config_values( + config_tasks, round_id, + derived_task_ids + )[cols] invalid_vals <- purrr::map2( uniq_tbl, uniq_config, @@ -90,6 +125,7 @@ summarise_invalid_values <- function(valid_tbl, accepted_vals) { invalid_vals_msg <- NULL } + # Get rowids of invalid value combinations invalid_val_idx <- purrr::imap( invalid_vals, ~ which(valid_tbl[[.y]] %in% .x) @@ -97,13 +133,20 @@ summarise_invalid_values <- function(valid_tbl, accepted_vals) { unlist(use.names = FALSE) %>% unique() invalid_row_idx <- which(is.na(valid_tbl$valid)) - invalid_combs_idx <- setdiff(invalid_val_idx, invalid_row_idx) + # Ignore rows which have already been reported for invalid values + invalid_combs_idx <- setdiff(invalid_row_idx, invalid_val_idx) if (length(invalid_combs_idx) == 0L) { invalid_combs_msg <- NULL } else { + # invalid_combs_idx indicates invalid value combinations in the table joined + # to expanded valid value grid. This changes the row order of the table, so + # to return rowids with respect to the original tbl row order we use + # invalid_combs_idx to extract values from the rowid column of the valid_tbl. + invalid_combs_idx <- valid_tbl$rowid[invalid_combs_idx] invalid_combs_msg <- cli::format_inline( - "Additionally row{?s} {.val {invalid_combs_idx}} contain invalid - combinations of valid values. + "Additionally {cli::qty(length(invalid_combs_idx))} row{?s} + {.val {invalid_combs_idx}} {cli::qty(length(invalid_combs_idx))} + {?contains/contain} invalid combinations of valid values. See {.var error_tbl} for details." ) } @@ -112,40 +155,3 @@ summarise_invalid_values <- function(valid_tbl, accepted_vals) { invalid_combs_idx = invalid_combs_idx ) } - - -get_numeric_output_type_ids <- function(file_path, hub_path) { - get_file_round_config(file_path, hub_path)[["model_tasks"]] %>% - purrr::map(~ .x[["output_type"]]) %>% - unlist(recursive = FALSE) %>% - purrr::map(~ purrr::pluck(.x, "output_type_id")) %>% - purrr::map_lgl(~ is.numeric(unlist(.x))) %>% - purrr::keep(isTRUE) %>% - names() %>% - unique() -} - - -coerce_num_output_type_ids <- function(tbl, file_path, hub_path) { - num_output_types <- get_numeric_output_type_ids( - file_path = file_path, - hub_path = hub_path - ) - - if ( - any(tbl[["output_type"]] %in% num_output_types) && - inherits(tbl[["output_type_id"]], "character") - ) { - type_coerce <- tbl[["output_type"]] %in% num_output_types - num_output_type_id <- suppressWarnings( - as.numeric(tbl$output_type_id[type_coerce]) - ) - # establish only valid coercions to distinguish between the potential for - # two cdf output types in the same round, one numeric and one character. - valid <- !is.na(num_output_type_id) - tbl$output_type_id[type_coerce][valid] <- as.character( - num_output_type_id[valid] - ) - } - tbl -} diff --git a/R/check_tbl_values_required.R b/R/check_tbl_values_required.R index 0077223b..496f581d 100644 --- a/R/check_tbl_values_required.R +++ b/R/check_tbl_values_required.R @@ -5,18 +5,23 @@ #' @inherit check_tbl_colnames params #' @inherit check_tbl_col_types return #' @export -check_tbl_values_required <- function(tbl, round_id, file_path, hub_path) { +check_tbl_values_required <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { tbl[["value"]] <- NULL config_tasks <- hubUtils::read_config(hub_path, "tasks") if (hubUtils::is_v3_config(config_tasks)) { tbl[tbl$output_type == "sample", "output_type_id"] <- NA } + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } req <- expand_model_out_grid( config_tasks, round_id = round_id, required_vals_only = TRUE, all_character = TRUE, - bind_model_tasks = FALSE + bind_model_tasks = FALSE, + derived_task_ids = derived_task_ids ) full <- expand_model_out_grid( @@ -25,7 +30,8 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path) { required_vals_only = FALSE, all_character = TRUE, as_arrow_table = FALSE, - bind_model_tasks = FALSE + bind_model_tasks = FALSE, + derived_task_ids = derived_task_ids ) tbl <- purrr::map( diff --git a/R/compound_taskid-utils.R b/R/compound_taskid-utils.R index b37357ad..3f44d4a8 100644 --- a/R/compound_taskid-utils.R +++ b/R/compound_taskid-utils.R @@ -8,6 +8,7 @@ #' @param error Logical. If TRUE, an error will be thrown if the compound task ID set is not valid. #' If FALSE and an error is detected, the detected compound task ID set will be #' returned with error attributes attached. +#' @inheritParams expand_model_out_grid #' #' @return A list of vectors of compound task IDs detected in the tbl, one for each #' modeling task in the round. If `compact` is TRUE, modeling tasks returning NULL @@ -29,11 +30,14 @@ #' ) #' get_tbl_compound_taskid_set <- function(tbl, config_tasks, round_id, - compact = TRUE, error = TRUE) { + compact = TRUE, error = TRUE, + derived_task_ids = NULL) { if (!inherits(tbl, "tbl_df")) { tbl <- dplyr::as_tibble(tbl) } - + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } tbl <- tbl[tbl$output_type == "sample", names(tbl) != "value"] out_tid <- hubUtils::std_colnames["output_type_id"] @@ -48,10 +52,12 @@ get_tbl_compound_taskid_set <- function(tbl, config_tasks, round_id, round_id = round_id, all_character = TRUE, include_sample_ids = FALSE, - bind_model_tasks = FALSE + bind_model_tasks = FALSE, + output_types = "sample", + derived_task_ids = derived_task_ids ), function(.x) { - if (!has_spls_tbl(.x)) { + if (nrow(.x) == 0L) { return(NULL) } dplyr::inner_join(tbl, .x[, names(.x) != out_tid], diff --git a/R/config_tasks-utils.R b/R/config_tasks-utils.R new file mode 100644 index 00000000..2266f529 --- /dev/null +++ b/R/config_tasks-utils.R @@ -0,0 +1,90 @@ +get_round_config <- function(config_tasks, round_id) { + round_idx <- hubUtils::get_round_idx(config_tasks, round_id) + purrr::pluck( + config_tasks, + "rounds", + round_idx + ) +} + +get_round_output_types <- function(config_tasks, round_id) { + round_config <- get_round_config(config_tasks, round_id) + purrr::map( + round_config[["model_tasks"]], + ~ .x[["output_type"]] + ) +} + +get_round_output_type_names <- function(config_tasks, round_id, + collapse = TRUE) { + out <- get_round_output_types(config_tasks, round_id) %>% + purrr::map(names) + + if (collapse) { + purrr::flatten_chr(out) %>% + unique() + } else { + out + } +} + +# get all task_ids values +get_round_config_values <- function(config_tasks, round_id, + derived_task_ids = NULL) { + model_tasks <- hubUtils::get_round_model_tasks(config_tasks, round_id) + task_id_names <- setdiff( + hubUtils::get_round_task_id_names(config_tasks, round_id), + derived_task_ids + ) + task_id_values <- purrr::map( + purrr::set_names(task_id_names), + \(.x) get_task_id_values(.x, model_tasks) + ) + if (!is.null(derived_task_ids)) { + task_id_values <- c( + task_id_values, + purrr::map( + purrr::set_names(derived_task_ids), + ~NA_character_ + ) + ) + } + + output_type_names <- get_round_output_type_names(config_tasks, round_id) + output_type_id_values <- purrr::map( + output_type_names, + \(.x) get_output_type_id_values(.x, model_tasks) + ) %>% purrr::flatten_chr() + + output_types <- list( + output_type = output_type_names, + output_type_id = output_type_id_values + ) + + c(task_id_values, output_types) +} +get_task_id_values <- function(task_id, model_tasks) { + purrr::map( + model_tasks, + ~ .x[["task_ids"]][[task_id]] + ) %>% + unlist(use.names = FALSE) %>% + unique() %>% + as.character() +} + +get_output_type_id_values <- function(output_type, model_tasks) { + out <- purrr::map( + model_tasks, + ~ .x[["output_type"]][[output_type]][["output_type_id"]] + ) %>% + unlist(use.names = FALSE) %>% + unique() %>% + as.character() + + if (length(out) == 0L) { + return(NA_character_) + } else { + return(out) + } +} diff --git a/R/expand_model_out_grid.R b/R/expand_model_out_grid.R index 568ac7c9..6a42ba6e 100644 --- a/R/expand_model_out_grid.R +++ b/R/expand_model_out_grid.R @@ -20,6 +20,11 @@ #' in the round. Can be used to override the compound task ID set defined in the #' config. If `NULL` is provided for a given modeling task, a compound task ID set of #' all task IDs is used. +#' @param output_types Character vector of output type names to include. +#' Use to subset for grids for specific output types. +#' @param derived_task_ids Character vector of derived task ID names (task IDs whose +#' values depend on other task IDs) to ignore. Columns for such task ids will +#' contain `NA`s. #' #' @return If `bind_model_tasks = TRUE` (default) a tibble or arrow table #' containing all possible task ID and related output type ID @@ -88,9 +93,9 @@ #' include_sample_ids = TRUE #' ) #' # Hub with sample output type and compound task ID structure -#' config_tasks <- hubUtils::read_config_file(system.file("config", "tasks-comp-tid.json", -#' package = "hubValidations" -#' )) +#' config_tasks <- hubUtils::read_config_file( +#' system.file("config", "tasks-comp-tid.json", package = "hubValidations") +#' ) #' expand_model_out_grid(config_tasks, #' round_id = "2022-12-26", #' include_sample_ids = TRUE @@ -114,6 +119,30 @@ #' NULL #' ) #' ) +#' # Subset output types +#' config_tasks <- hubUtils::read_config( +#' system.file("testhubs", "samples", package = "hubValidations") +#' ) +#' expand_model_out_grid(config_tasks, +#' round_id = "2022-10-29", +#' include_sample_ids = TRUE, +#' bind_model_tasks = FALSE, +#' output_types = c("sample", "pmf"), +#' ) +#' expand_model_out_grid(config_tasks, +#' round_id = "2022-10-29", +#' include_sample_ids = TRUE, +#' bind_model_tasks = TRUE, +#' output_types = "sample", +#' ) +#' # Ignore derived task IDs +#' expand_model_out_grid(config_tasks, +#' round_id = "2022-10-29", +#' include_sample_ids = TRUE, +#' bind_model_tasks = FALSE, +#' output_types = "sample", +#' derived_task_ids = "target_end_date" +#' ) expand_model_out_grid <- function(config_tasks, round_id, required_vals_only = FALSE, @@ -126,20 +155,22 @@ expand_model_out_grid <- function(config_tasks, as_arrow_table = FALSE, bind_model_tasks = TRUE, include_sample_ids = FALSE, - compound_taskid_set = NULL) { - round_idx <- hubUtils::get_round_idx(config_tasks, round_id) + compound_taskid_set = NULL, + output_types = NULL, + derived_task_ids = NULL) { checkmate::assert_list(compound_taskid_set, null.ok = TRUE) output_type_id_datatype <- rlang::arg_match(output_type_id_datatype) - - round_config <- purrr::pluck( - config_tasks, - "rounds", - round_idx + output_types <- validate_output_types(output_types, config_tasks, round_id) + derived_task_ids <- validate_derived_task_ids( + derived_task_ids, + config_tasks, round_id ) + round_config <- get_round_config(config_tasks, round_id) task_id_l <- purrr::map( round_config[["model_tasks"]], ~ .x[["task_ids"]] %>% + derived_taskids_to_na(derived_task_ids) %>% null_taskids_to_na() ) %>% # Fix round_id value to current round_id in round_id variable column @@ -158,7 +189,13 @@ expand_model_out_grid <- function(config_tasks, output_type_l <- purrr::map( round_config[["model_tasks"]], function(.x) { - .x[["output_type"]] + out <- .x[["output_type"]] + if (is.null(output_types)) { + out + } else { + mt_output_types <- output_types[output_types %in% names(out)] + out[mt_output_types] + } } ) %>% purrr::map( @@ -258,7 +295,6 @@ process_mt_grid_outputs <- function(x, config_tasks, all_character, as_arrow_table = TRUE, bind_model_tasks = TRUE, output_type_id_datatype = output_type_id_datatype) { - if (bind_model_tasks) { # To bind multiple modeling task grids together, we need to ensure they contain # the same columns. Any missing columns are padded with NAs. @@ -304,6 +340,9 @@ process_mt_grid_outputs <- function(x, config_tasks, all_character, # Pad any columns in all_cols missing in x of with new NA columns pad_missing_cols <- function(x, all_cols) { + if (ncol(x) == 0L) { + return(x) + } if (inherits(x, "data.frame")) { x[, all_cols[!all_cols %in% names(x)]] <- NA return(x[, all_cols]) @@ -339,6 +378,22 @@ null_taskids_to_na <- function(model_task) { ) } +# Set derived task_ids to all NULL values. +derived_taskids_to_na <- function(model_task, derived_task_ids) { + if (!is.null(derived_task_ids)) { + purrr::modify_at( + model_task, + .at = derived_task_ids, + .f = ~ list( + required = NULL, + optional = NA + ) + ) + } else { + model_task + } +} + # Adds example sample ids to the output type id column which are unique # across multiple modeling task groups. Only apply to v3 and above sample output # type configurations. @@ -472,3 +527,69 @@ extract_mt_output_type_ids <- function(x, config_tid) { } ) } + +validate_output_types <- function(output_types, config_tasks, round_id, + call = rlang::caller_call()) { + checkmate::assert_character(output_types, null.ok = TRUE) + if (is.null(output_types)) { + return(NULL) + } + round_output_types <- get_round_output_type_names(config_tasks, round_id) + invalid_output_types <- setdiff(output_types, round_output_types) + if (length(invalid_output_types) > 0L) { + cli::cli_abort( + c( + "x" = "{.val {invalid_output_types}} {?is/are} not valid output type{?s}.", + "i" = "{.arg output_types} must be members of: {.val {round_output_types}}" + ), + call = call + ) + } + output_types +} + +validate_derived_task_ids <- function(derived_task_ids, config_tasks, round_id) { + checkmate::assert_character(derived_task_ids, null.ok = TRUE) + if (is.null(derived_task_ids)) { + return(NULL) + } + round_task_ids <- hubUtils::get_round_task_id_names(config_tasks, round_id) + valid_task_ids <- intersect(derived_task_ids, round_task_ids) + if (length(valid_task_ids) < length(derived_task_ids)) { + cli::cli_warn( + c( + "x" = "{.val {setdiff(derived_task_ids, round_task_ids)}} + {?is/are} not valid task ID{?s}. Ignored.", + "i" = "{.arg derived_task_ids} must be a member of: {.val {round_task_ids}}" + ), + call = rlang::caller_call() + ) + } + model_tasks <- hubUtils::get_round_model_tasks(config_tasks, round_id) + has_required <- purrr::map( + model_tasks, + ~ .x[["task_ids"]][valid_task_ids] %>% + purrr::map_lgl( + ~ !is.null(.x$required) + ) + ) %>% + purrr::reduce(`|`) + if (any(has_required)) { + cli::cli_abort( + c( + "x" = "Derived task IDs cannot have required task ID values.", + "!" = "{.val {names(has_required)[has_required]}} ha{?s/ve} + required task ID values. Ignored." + ), + call = rlang::caller_call() + ) + } + valid_task_ids <- intersect( + valid_task_ids, + names(has_required)[!has_required] + ) + if (length(valid_task_ids) == 0L) { + return(NULL) + } + valid_task_ids +} diff --git a/R/match_tbl_to_model_task.R b/R/match_tbl_to_model_task.R new file mode 100644 index 00000000..6974599b --- /dev/null +++ b/R/match_tbl_to_model_task.R @@ -0,0 +1,49 @@ +#' Match model output `tbl` data to their model tasks in `config_tasks`. +#' +#' Split and match model output `tbl` data to their corresponding model tasks in +#' `config_tasks`. Useful for performing model task specific checks on model output. +#' For v3 samples, the `output_type_id` column is set to `NA` for `sample` outputs. +#' @inheritParams expand_model_out_grid +#' @inheritParams check_tbl_colnames +#' +#' @return A list containing a `tbl_df` of model output data matched to a model +#' task with one element per round model task. +#' @export +#' +#' @examples +#' hub_path <- system.file("testhubs/samples", package = "hubValidations") +#' tbl <- read_model_out_file( +#' file_path = "flu-base/2022-10-22-flu-base.csv", +#' hub_path, coerce_types = "chr" +#' ) +#' config_tasks <- hubUtils::read_config(hub_path, "tasks") +#' match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22") +#' match_tbl_to_model_task(tbl, config_tasks, +#' round_id = "2022-10-22", +#' output_types = "sample" +#' ) +match_tbl_to_model_task <- function(tbl, config_tasks, round_id, + output_types = NULL, derived_task_ids = NULL, + all_character = TRUE) { + join_cols <- names(tbl)[names(tbl) != "value"] + if (hubUtils::is_v3_config(config_tasks)) { + tbl[tbl$output_type == "sample", "output_type_id"] <- NA + } + + expand_model_out_grid( + config_tasks, + round_id = round_id, + required_vals_only = FALSE, + all_character = TRUE, + as_arrow_table = FALSE, + bind_model_tasks = FALSE, + output_types = output_types, + derived_task_ids = derived_task_ids + ) %>% + purrr::map(\(.x) { + if (nrow(.x) == 0L) { + return(NULL) + } + dplyr::inner_join(.x, tbl, by = join_cols) + }) +} diff --git a/R/submission_tmpl.R b/R/submission_tmpl.R index a84fde80..2e4ffd0d 100644 --- a/R/submission_tmpl.R +++ b/R/submission_tmpl.R @@ -92,10 +92,30 @@ #' NULL #' ) #' ) +#' # Subsetting for a single output type +#' submission_tmpl( +#' config_tasks = config_tasks, +#' round_id = "2022-12-26", +#' output_types = "sample" +#' ) +#' # Derive a template with ignored derived task ID. Useful to avoid creating +#' # a template with invalid derived task ID value combinations. +#' config_tasks <- hubUtils::read_config( +#' system.file("testhubs", "flusight", package = "hubValidations") +#' ) +#' submission_tmpl( +#' config_tasks = config_tasks, +#' round_id = "2022-12-12", +#' output_types = "pmf", +#' derived_task_ids = "target_end_date", +#' complete_cases_only = FALSE +#' ) submission_tmpl <- function(hub_con, config_tasks, round_id, required_vals_only = FALSE, complete_cases_only = TRUE, - compound_taskid_set = NULL) { + compound_taskid_set = NULL, + output_types = NULL, + derived_task_ids = NULL) { switch(rlang::check_exclusive(hub_con, config_tasks), hub_con = { checkmate::assert_class(hub_con, classes = "hub_connection") @@ -108,7 +128,9 @@ submission_tmpl <- function(hub_con, config_tasks, round_id, round_id = round_id, required_vals_only = required_vals_only, include_sample_ids = TRUE, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + output_types = output_types, + derived_task_ids = derived_task_ids ) tmpl_cols <- c( diff --git a/R/v3-sample-utils.R b/R/v3-sample-utils.R index d8bc3c7f..c85c84ad 100644 --- a/R/v3-sample-utils.R +++ b/R/v3-sample-utils.R @@ -6,8 +6,12 @@ # compound task id sets across modeling tasks). This is achieved by using a full # join between tbl sample data and the model output data sample grid for each # modeling task. This means that only valid task id combinations are considered. -spl_hash_tbl <- function(tbl, round_id, config_tasks, compound_taskid_set = NULL) { +spl_hash_tbl <- function(tbl, round_id, config_tasks, compound_taskid_set = NULL, + derived_task_ids = NULL) { tbl <- tbl[tbl$output_type == "sample", names(tbl) != "value"] + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } mt_spl_grid <- expand_model_out_grid( config_tasks = config_tasks, @@ -15,20 +19,19 @@ spl_hash_tbl <- function(tbl, round_id, config_tasks, compound_taskid_set = NULL all_character = TRUE, include_sample_ids = TRUE, bind_model_tasks = FALSE, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + output_types = "sample", + derived_task_ids = derived_task_ids ) %>% stats::setNames(seq_along(.)) mt_tbls <- purrr::map( mt_spl_grid, function(.x) { - if (is.null(.x) || !has_spls_tbl(.x)) { + if (nrow(.x) == 0L) { NULL } else { - dplyr::filter( - .x, .data$output_type == "sample" - ) %>% - dplyr::rename(compound_idx = "output_type_id") %>% + dplyr::rename(.x, compound_idx = "output_type_id") %>% dplyr::inner_join(tbl, ., by = setdiff( names(tbl), @@ -64,7 +67,6 @@ spl_hash_tbl <- function(tbl, round_id, config_tasks, compound_taskid_set = NULL } get_mt_spl_hash_tbl <- function(tbl, compound_taskids, round_task_ids) { - if (is.null(tbl)) { return(NULL) } diff --git a/R/validate_model_data.R b/R/validate_model_data.R index 5cd7a03b..919f53eb 100644 --- a/R/validate_model_data.R +++ b/R/validate_model_data.R @@ -3,6 +3,7 @@ #' @inheritParams check_tbl_unique_round_id #' @inheritParams validate_model_file #' @inheritParams hubData::create_hub_schema +#' @inheritParams expand_model_out_grid #' @inherit validate_model_file return #' @export #' @details @@ -29,7 +30,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, "double", "integer", "logical", "Date" ), - validations_cfg_path = NULL) { + validations_cfg_path = NULL, + derived_task_ids = NULL) { checks <- new_hub_validations() file_meta <- parse_file_name(file_path) @@ -136,7 +138,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl_chr, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$valid_vals)) { @@ -156,7 +159,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl_chr, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) @@ -166,7 +170,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) @@ -191,7 +196,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl_chr, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$spl_compound_taskid_set)) { @@ -205,7 +211,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, round_id = round_id, file_path = file_path, hub_path = hub_path, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$spl_compound_tid)) { @@ -217,7 +224,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, round_id = round_id, file_path = file_path, hub_path = hub_path, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$spl_non_compound_tid)) { @@ -229,7 +237,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, round_id = round_id, file_path = file_path, hub_path = hub_path, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + derived_task_ids = derived_task_ids ), file_path ) } diff --git a/R/validate_pr.R b/R/validate_pr.R index 48e44f03..15b37c41 100644 --- a/R/validate_pr.R +++ b/R/validate_pr.R @@ -107,7 +107,8 @@ validate_pr <- function(hub_path = ".", gh_repo, pr_number, submit_window_ref_date_from = c( "file", "file_path" - )) { + ), + derived_task_ids = NULL) { file_modification_check <- rlang::arg_match(file_modification_check) output_type_id_datatype <- rlang::arg_match(output_type_id_datatype) model_output_dir <- get_hub_model_output_dir(hub_path) # nolint: object_name_linter @@ -179,6 +180,7 @@ validate_pr <- function(hub_path = ".", gh_repo, pr_number, output_type_id_datatype = output_type_id_datatype, validations_cfg_path = validations_cfg_path, skip_submit_window_check = skip_submit_window_check, + derived_task_ids = derived_task_ids, skip_check_config = TRUE ) ) %>% diff --git a/R/validate_submission.R b/R/validate_submission.R index e60a1f76..278007b2 100644 --- a/R/validate_submission.R +++ b/R/validate_submission.R @@ -6,6 +6,7 @@ #' #' @inherit validate_model_data return params #' @inheritParams hubData::create_hub_schema +#' @inheritParams expand_model_out_grid #' @param skip_submit_window_check Logical. Whether to skip the submission window check. #' @param skip_check_config Logical. Whether to skip the hub config validation check. #' check. @@ -49,7 +50,8 @@ validate_submission <- function(hub_path, file_path, round_id_col = NULL, submit_window_ref_date_from = c( "file", "file_path" - )) { + ), + derived_task_ids = NULL) { check_hub_config <- new_hub_validations() output_type_id_datatype <- rlang::arg_match(output_type_id_datatype) @@ -89,7 +91,8 @@ validate_submission <- function(hub_path, file_path, round_id_col = NULL, file_path = file_path, round_id_col = round_id_col, output_type_id_datatype = output_type_id_datatype, - validations_cfg_path = validations_cfg_path + validations_cfg_path = validations_cfg_path, + derived_task_ids = derived_task_ids ) combine(check_hub_config, checks_file, checks_data, checks_submission_time) diff --git a/_pkgdown.yml b/_pkgdown.yml index 61a7cf7f..18b36391 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -37,6 +37,7 @@ reference: - try_check - expand_model_out_grid - get_tbl_compound_taskid_set + - match_tbl_to_model_task - title: "`` methods" diff --git a/man/check_tbl_spl_compound_taskid_set.Rd b/man/check_tbl_spl_compound_taskid_set.Rd index 65c49b84..90a19c41 100644 --- a/man/check_tbl_spl_compound_taskid_set.Rd +++ b/man/check_tbl_spl_compound_taskid_set.Rd @@ -5,7 +5,13 @@ \title{Check model output data tbl sample compound task id sets for each modeling task match or are coarser than the expected set defined in the config.} \usage{ -check_tbl_spl_compound_taskid_set(tbl, round_id, file_path, hub_path) +check_tbl_spl_compound_taskid_set( + tbl, + round_id, + file_path, + hub_path, + derived_task_ids = NULL +) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. @@ -25,6 +31,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_spl_compound_tid.Rd b/man/check_tbl_spl_compound_tid.Rd index eb18f336..3084f95b 100644 --- a/man/check_tbl_spl_compound_tid.Rd +++ b/man/check_tbl_spl_compound_tid.Rd @@ -10,7 +10,8 @@ check_tbl_spl_compound_tid( round_id, file_path, hub_path, - compound_taskid_set = NULL + compound_taskid_set = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -34,6 +35,10 @@ files within the \code{hub-config} directory.} \item{compound_taskid_set}{a list of \code{compound_taskid_set}s (characters vector of compound task IDs), one for each modeling task. Used to override the compound task ID set in the config file, for example, when validating coarser samples.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_spl_n.Rd b/man/check_tbl_spl_n.Rd index c32afb77..8c714607 100644 --- a/man/check_tbl_spl_n.Rd +++ b/man/check_tbl_spl_n.Rd @@ -5,7 +5,14 @@ \title{Check model output data tbl samples contain the appropriate number of samples for a given compound idx.} \usage{ -check_tbl_spl_n(tbl, round_id, file_path, hub_path, compound_taskid_set = NULL) +check_tbl_spl_n( + tbl, + round_id, + file_path, + hub_path, + compound_taskid_set = NULL, + derived_task_ids = NULL +) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. Column types must \strong{all be character}.} @@ -28,6 +35,10 @@ files within the \code{hub-config} directory.} \item{compound_taskid_set}{a list of \code{compound_taskid_set}s (characters vector of compound task IDs), one for each modeling task. Used to override the compound task ID set in the config file, for example, when validating coarser samples.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_spl_non_compound_tid.Rd b/man/check_tbl_spl_non_compound_tid.Rd index 0b3daf91..26efa67c 100644 --- a/man/check_tbl_spl_non_compound_tid.Rd +++ b/man/check_tbl_spl_non_compound_tid.Rd @@ -10,7 +10,8 @@ check_tbl_spl_non_compound_tid( round_id, file_path, hub_path, - compound_taskid_set = NULL + compound_taskid_set = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -34,6 +35,10 @@ files within the \code{hub-config} directory.} \item{compound_taskid_set}{a list of \code{compound_taskid_set}s (characters vector of compound task IDs), one for each modeling task. Used to override the compound task ID set in the config file, for example, when validating coarser samples.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_value_col.Rd b/man/check_tbl_value_col.Rd index 36dc8cfb..b443a871 100644 --- a/man/check_tbl_value_col.Rd +++ b/man/check_tbl_value_col.Rd @@ -4,7 +4,13 @@ \alias{check_tbl_value_col} \title{Check output type values of model output data against config} \usage{ -check_tbl_value_col(tbl, round_id, file_path, hub_path) +check_tbl_value_col( + tbl, + round_id, + file_path, + hub_path, + derived_task_ids = NULL +) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated.} @@ -23,6 +29,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_values.Rd b/man/check_tbl_values.Rd index 19229251..386bb55c 100644 --- a/man/check_tbl_values.Rd +++ b/man/check_tbl_values.Rd @@ -4,7 +4,7 @@ \alias{check_tbl_values} \title{Check model output data tbl contains valid value combinations} \usage{ -check_tbl_values(tbl, round_id, file_path, hub_path) +check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = NULL) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. Column types must \strong{all be character}.} @@ -23,6 +23,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_values_required.Rd b/man/check_tbl_values_required.Rd index 084df7c2..83cd5aab 100644 --- a/man/check_tbl_values_required.Rd +++ b/man/check_tbl_values_required.Rd @@ -5,7 +5,13 @@ \title{Check all required task ID/output type/output type ID value combinations present in model data.} \usage{ -check_tbl_values_required(tbl, round_id, file_path, hub_path) +check_tbl_values_required( + tbl, + round_id, + file_path, + hub_path, + derived_task_ids = NULL +) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. Column types must \strong{all be character}.} @@ -24,6 +30,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/expand_model_out_grid.Rd b/man/expand_model_out_grid.Rd index 701476ea..03f0563f 100644 --- a/man/expand_model_out_grid.Rd +++ b/man/expand_model_out_grid.Rd @@ -14,7 +14,9 @@ expand_model_out_grid( as_arrow_table = FALSE, bind_model_tasks = TRUE, include_sample_ids = FALSE, - compound_taskid_set = NULL + compound_taskid_set = NULL, + output_types = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -60,6 +62,13 @@ the \code{output_type_id} column.} in the round. Can be used to override the compound task ID set defined in the config. If \code{NULL} is provided for a given modeling task, a compound task ID set of all task IDs is used.} + +\item{output_types}{Character vector of output type names to include. +Use to subset for grids for specific output types.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ If \code{bind_model_tasks = TRUE} (default) a tibble or arrow table @@ -131,9 +140,9 @@ expand_model_out_grid(config_tasks, include_sample_ids = TRUE ) # Hub with sample output type and compound task ID structure -config_tasks <- hubUtils::read_config_file(system.file("config", "tasks-comp-tid.json", - package = "hubValidations" -)) +config_tasks <- hubUtils::read_config_file( + system.file("config", "tasks-comp-tid.json", package = "hubValidations") +) expand_model_out_grid(config_tasks, round_id = "2022-12-26", include_sample_ids = TRUE @@ -157,4 +166,28 @@ expand_model_out_grid(config_tasks, NULL ) ) +# Subset output types +config_tasks <- hubUtils::read_config( + system.file("testhubs", "samples", package = "hubValidations") +) +expand_model_out_grid(config_tasks, + round_id = "2022-10-29", + include_sample_ids = TRUE, + bind_model_tasks = FALSE, + output_types = c("sample", "pmf"), +) +expand_model_out_grid(config_tasks, + round_id = "2022-10-29", + include_sample_ids = TRUE, + bind_model_tasks = TRUE, + output_types = "sample", +) +# Ignore derived task IDs +expand_model_out_grid(config_tasks, + round_id = "2022-10-29", + include_sample_ids = TRUE, + bind_model_tasks = FALSE, + output_types = "sample", + derived_task_ids = "target_end_date" +) } diff --git a/man/get_tbl_compound_taskid_set.Rd b/man/get_tbl_compound_taskid_set.Rd index 98efe5ad..02ca0732 100644 --- a/man/get_tbl_compound_taskid_set.Rd +++ b/man/get_tbl_compound_taskid_set.Rd @@ -9,7 +9,8 @@ get_tbl_compound_taskid_set( config_tasks, round_id, compact = TRUE, - error = TRUE + error = TRUE, + derived_task_ids = NULL ) } \arguments{ @@ -25,6 +26,10 @@ Column types must \strong{all be character}.} \item{error}{Logical. If TRUE, an error will be thrown if the compound task ID set is not valid. If FALSE and an error is detected, the detected compound task ID set will be returned with error attributes attached.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ A list of vectors of compound task IDs detected in the tbl, one for each diff --git a/man/match_tbl_to_model_task.Rd b/man/match_tbl_to_model_task.Rd new file mode 100644 index 00000000..bf930b1e --- /dev/null +++ b/man/match_tbl_to_model_task.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/match_tbl_to_model_task.R +\name{match_tbl_to_model_task} +\alias{match_tbl_to_model_task} +\title{Match model output \code{tbl} data to their model tasks in \code{config_tasks}.} +\usage{ +match_tbl_to_model_task( + tbl, + config_tasks, + round_id, + output_types = NULL, + derived_task_ids = NULL, + all_character = TRUE +) +} +\arguments{ +\item{tbl}{a tibble/data.frame of the contents of the file being validated.} + +\item{config_tasks}{a list version of the content's of a hub's \code{tasks.json} +config file, accessed through the \code{"config_tasks"} attribute of a \verb{} +object or function \code{\link[hubUtils:read_config]{hubUtils::read_config()}}.} + +\item{round_id}{Character string. Round identifier. If the round is set to +\code{round_id_from_variable: true}, IDs are values of the task ID defined in the round's +\code{round_id} property of \code{config_tasks}. +Otherwise should match round's \code{round_id} value in config. Ignored if hub +contains only a single round.} + +\item{output_types}{Character vector of output type names to include. +Use to subset for grids for specific output types.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} + +\item{all_character}{Logical. Whether to return all character column.} +} +\value{ +A list containing a \code{tbl_df} of model output data matched to a model +task with one element per round model task. +} +\description{ +Split and match model output \code{tbl} data to their corresponding model tasks in +\code{config_tasks}. Useful for performing model task specific checks on model output. +For v3 samples, the \code{output_type_id} column is set to \code{NA} for \code{sample} outputs. +} +\examples{ +hub_path <- system.file("testhubs/samples", package = "hubValidations") +tbl <- read_model_out_file( + file_path = "flu-base/2022-10-22-flu-base.csv", + hub_path, coerce_types = "chr" +) +config_tasks <- hubUtils::read_config(hub_path, "tasks") +match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22") +match_tbl_to_model_task(tbl, config_tasks, + round_id = "2022-10-22", + output_types = "sample" +) +} diff --git a/man/submission_tmpl.Rd b/man/submission_tmpl.Rd index dfc6ced6..3b1a7efa 100644 --- a/man/submission_tmpl.Rd +++ b/man/submission_tmpl.Rd @@ -10,7 +10,9 @@ submission_tmpl( round_id, required_vals_only = FALSE, complete_cases_only = TRUE, - compound_taskid_set = NULL + compound_taskid_set = NULL, + output_types = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -38,6 +40,13 @@ are included in the output.} in the round. Can be used to override the compound task ID set defined in the config. If \code{NULL} is provided for a given modeling task, a compound task ID set of all task IDs is used.} + +\item{output_types}{Character vector of output type names to include. +Use to subset for grids for specific output types.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ a tibble template containing an expanded grid of valid task ID and @@ -127,4 +136,22 @@ submission_tmpl( NULL ) ) +# Subsetting for a single output type +submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-12-26", + output_types = "sample" +) +# Derive a template with ignored derived task ID. Useful to avoid creating +# a template with invalid derived task ID value combinations. +config_tasks <- hubUtils::read_config( + system.file("testhubs", "flusight", package = "hubValidations") +) +submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-12-12", + output_types = "pmf", + derived_task_ids = "target_end_date", + complete_cases_only = FALSE +) } diff --git a/man/validate_model_data.Rd b/man/validate_model_data.Rd index a06355cc..4ffa95e1 100644 --- a/man/validate_model_data.Rd +++ b/man/validate_model_data.Rd @@ -10,7 +10,8 @@ validate_model_data( round_id_col = NULL, output_type_id_datatype = c("from_config", "auto", "character", "double", "integer", "logical", "Date"), - validations_cfg_path = NULL + validations_cfg_path = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -47,6 +48,10 @@ behaviour so use with care.} \item{validations_cfg_path}{Path to \code{validations.yml} file. If \code{NULL} defaults to \code{hub-config/validations.yml}.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ An object of class \code{hub_validations}. Each named element contains diff --git a/man/validate_pr.Rd b/man/validate_pr.Rd index 9bc41312..3e6db5db 100644 --- a/man/validate_pr.Rd +++ b/man/validate_pr.Rd @@ -15,7 +15,8 @@ validate_pr( skip_submit_window_check = FALSE, file_modification_check = c("error", "warn", "message", "none"), allow_submit_window_mods = TRUE, - submit_window_ref_date_from = c("file", "file_path") + submit_window_ref_date_from = c("file", "file_path"), + derived_task_ids = NULL ) } \arguments{ @@ -81,6 +82,10 @@ Only applicable when a round is configured to determine the submission windows relative to the value in a date column in model output files. Not applicable when explicit submission window start and end dates are provided in the hub's config.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ An object of class \code{hub_validations}. diff --git a/man/validate_submission.Rd b/man/validate_submission.Rd index 71b403ef..cbbee86a 100644 --- a/man/validate_submission.Rd +++ b/man/validate_submission.Rd @@ -13,7 +13,8 @@ validate_submission( "logical", "Date"), skip_submit_window_check = FALSE, skip_check_config = FALSE, - submit_window_ref_date_from = c("file", "file_path") + submit_window_ref_date_from = c("file", "file_path"), + derived_task_ids = NULL ) } \arguments{ @@ -64,6 +65,10 @@ Only applicable when a round is configured to determine the submission windows relative to the value in a date column in model output files. Not applicable when explicit submission window start and end dates are provided in the hub's config.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ An object of class \code{hub_validations}. Each named element contains diff --git a/tests/testthat/_snaps/check_tbl_spl_compound_taskid_set.md b/tests/testthat/_snaps/check_tbl_spl_compound_taskid_set.md index 6949c488..2606df6d 100644 --- a/tests/testthat/_snaps/check_tbl_spl_compound_taskid_set.md +++ b/tests/testthat/_snaps/check_tbl_spl_compound_taskid_set.md @@ -268,3 +268,13 @@ $ use_cli_format : logi TRUE - attr(*, "class")= chr [1:5] "check_error" "hub_check" "rlang_error" "error" ... +# Ignoring derived_task_ids in check_tbl_spl_compound_taskid_set works + + Code + check_tbl_spl_compound_taskid_set(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date") + Output + + Message: + All samples in a model task conform to single, unique compound task ID set that matches or is coarser than the configured `compound_taksid_set`. + diff --git a/tests/testthat/_snaps/check_tbl_spl_compound_tid.md b/tests/testthat/_snaps/check_tbl_spl_compound_tid.md index 08eab7bd..097993b9 100644 --- a/tests/testthat/_snaps/check_tbl_spl_compound_tid.md +++ b/tests/testthat/_snaps/check_tbl_spl_compound_tid.md @@ -84,3 +84,13 @@ Message: Each sample compound task ID contains single, unique value. +# Ignoring derived_task_ids in check_tbl_spl_compound_tid works + + Code + check_tbl_spl_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date") + Output + + Message: + Each sample compound task ID contains single, unique value. + diff --git a/tests/testthat/_snaps/check_tbl_spl_n.md b/tests/testthat/_snaps/check_tbl_spl_n.md index e8b0be82..7d646f3c 100644 --- a/tests/testthat/_snaps/check_tbl_spl_n.md +++ b/tests/testthat/_snaps/check_tbl_spl_n.md @@ -155,7 +155,7 @@ -# Overriding compound_taskid_set in check_tbl_spl_compound_tid works +# Overriding compound_taskid_set in check_tbl_spl_n works Code str(check_tbl_spl_n(tbl_coarse, round_id, file_path, hub_path)) @@ -244,3 +244,12 @@ Warning: Number of samples per compound idx not consistent. Sample numbers supplied per compound idx vary between 9 and 10. See `errors` attribute for details. +# Ignoring derived_task_ids in check_tbl_spl_n works + + Code + check_tbl_spl_n(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date") + Output + + Message: + Required samples per compound idx task present. + diff --git a/tests/testthat/_snaps/check_tbl_spl_non_compound_tid.md b/tests/testthat/_snaps/check_tbl_spl_non_compound_tid.md index 428fe7fd..0b815291 100644 --- a/tests/testthat/_snaps/check_tbl_spl_non_compound_tid.md +++ b/tests/testthat/_snaps/check_tbl_spl_non_compound_tid.md @@ -61,3 +61,13 @@ Message: Task ID combinations of non compound task id values consistent across modeling task samples. +# Ignoring derived_task_ids in check_tbl_spl_compound_tid works + + Code + check_tbl_spl_non_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date") + Output + + Message: + Task ID combinations of non compound task id values consistent across modeling task samples. + diff --git a/tests/testthat/_snaps/check_tbl_value_col.md b/tests/testthat/_snaps/check_tbl_value_col.md index c9b1cbe3..0aa81a7b 100644 --- a/tests/testthat/_snaps/check_tbl_value_col.md +++ b/tests/testthat/_snaps/check_tbl_value_col.md @@ -34,3 +34,12 @@ Warning: Values in column `value` are not all valid with respect to modeling task config. Value "fail" cannot be coerced to expected data type "integer" for output type "quantile".Values in column `value` are not all valid with respect to modeling task config. Values "196.83", "244.85", "310.9", "499.9", "394.8", "461.85", "629.7", "534.6", "599.75", "727.5", "669.7", "725.562456357284", "801.689162048122", "800.239938093011", "915.5", "949.59", "938.67", "962.03", ..., "2775.56600111008", and "2139.07" cannot be coerced to expected data type "integer" for output type "quantile".Values in column `value` are not all valid with respect to modeling task config. Value -6 is smaller than allowed minimum value 0 for output type "quantile". +# Ignoring derived_task_ids in check_tbl_value_col works + + Code + check_tbl_value_col(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date") + Output + + Message: + Values in column `value` all valid with respect to modeling task config. + diff --git a/tests/testthat/_snaps/check_tbl_values.md b/tests/testthat/_snaps/check_tbl_values.md index 9b9d3654..fb0d3e76 100644 --- a/tests/testthat/_snaps/check_tbl_values.md +++ b/tests/testthat/_snaps/check_tbl_values.md @@ -38,3 +38,33 @@ Error: ! `tbl` contains invalid values/value combinations. Column `horizon` contains invalid values "11" and "12". +# Ignoring derived_task_ids in check_tbl_values works + + Code + check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date") + Output + + Message: + `tbl` contains valid values/value combinations. + +--- + + Code + check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date") + Output + + Error: + ! `tbl` contains invalid values/value combinations. Column `horizon` contains invalid value "9". Additionally row 2 contains invalid combinations of valid values. See `error_tbl` for details. + +--- + + Code + check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date")$ + error_tbl + Output + # A tibble: 1 x 7 + location reference_date horizon target_end_date target output_type + + 1 US 2022-10-22 1 2022-10-29 wk inc flu hosp pmf + # i 1 more variable: output_type_id + diff --git a/tests/testthat/_snaps/check_tbl_values_required.md b/tests/testthat/_snaps/check_tbl_values_required.md index 89c819fd..a52aeb84 100644 --- a/tests/testthat/_snaps/check_tbl_values_required.md +++ b/tests/testthat/_snaps/check_tbl_values_required.md @@ -171,3 +171,12 @@ # i 11 more rows # i 1 more variable: output_type_id +# Ignoring derived_task_ids in check_tbl_values_required works + + Code + check_tbl_values_required(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date") + Output + + Message: + Required task ID/output type/output type ID combinations all present. + diff --git a/tests/testthat/_snaps/expand_model_out_grid.md b/tests/testthat/_snaps/expand_model_out_grid.md index edec6408..c1a91de2 100644 --- a/tests/testthat/_snaps/expand_model_out_grid.md +++ b/tests/testthat/_snaps/expand_model_out_grid.md @@ -485,6 +485,152 @@ 10 2022-12-26 wk ahead inc flu h~ 1 01 sample 4 # i 32 more rows +# expand_model_out_grid output type subsetting works + + Code + expand_model_out_grid(config_tasks, round_id = "2022-12-26", + include_sample_ids = TRUE, bind_model_tasks = FALSE, output_types = c("pmf", + "sample"), ) + Output + [[1]] + # A tibble: 6 x 6 + forecast_date target horizon location output_type output_type_id + + 1 2022-12-26 wk ahead inc flu ho~ 2 US sample 1 + 2 2022-12-26 wk ahead inc flu ho~ 2 01 sample 1 + 3 2022-12-26 wk ahead inc flu ho~ 2 02 sample 1 + 4 2022-12-26 wk ahead inc flu ho~ 1 US sample 2 + 5 2022-12-26 wk ahead inc flu ho~ 1 01 sample 2 + 6 2022-12-26 wk ahead inc flu ho~ 1 02 sample 2 + + [[2]] + # A tibble: 30 x 6 + forecast_date target horizon location output_type output_type_id + + 1 2022-12-26 wk flu hosp rate c~ 2 US pmf large_decrease + 2 2022-12-26 wk flu hosp rate c~ 1 US pmf large_decrease + 3 2022-12-26 wk flu hosp rate c~ 2 01 pmf large_decrease + 4 2022-12-26 wk flu hosp rate c~ 1 01 pmf large_decrease + 5 2022-12-26 wk flu hosp rate c~ 2 02 pmf large_decrease + 6 2022-12-26 wk flu hosp rate c~ 1 02 pmf large_decrease + 7 2022-12-26 wk flu hosp rate c~ 2 US pmf decrease + 8 2022-12-26 wk flu hosp rate c~ 1 US pmf decrease + 9 2022-12-26 wk flu hosp rate c~ 2 01 pmf decrease + 10 2022-12-26 wk flu hosp rate c~ 1 01 pmf decrease + # i 20 more rows + + +--- + + Code + expand_model_out_grid(config_tasks, round_id = "2022-12-26", + include_sample_ids = TRUE, bind_model_tasks = FALSE, output_types = "sample", ) + Output + [[1]] + # A tibble: 6 x 6 + forecast_date target horizon location output_type output_type_id + + 1 2022-12-26 wk ahead inc flu ho~ 2 US sample 1 + 2 2022-12-26 wk ahead inc flu ho~ 2 01 sample 1 + 3 2022-12-26 wk ahead inc flu ho~ 2 02 sample 1 + 4 2022-12-26 wk ahead inc flu ho~ 1 US sample 2 + 5 2022-12-26 wk ahead inc flu ho~ 1 01 sample 2 + 6 2022-12-26 wk ahead inc flu ho~ 1 02 sample 2 + + [[2]] + # A tibble: 0 x 0 + + +--- + + Code + expand_model_out_grid(config_tasks, round_id = "2022-12-26", + include_sample_ids = TRUE, bind_model_tasks = TRUE, output_types = "sample", ) + Output + # A tibble: 6 x 6 + forecast_date target horizon location output_type output_type_id + + 1 2022-12-26 wk ahead inc flu ho~ 2 US sample 1 + 2 2022-12-26 wk ahead inc flu ho~ 2 01 sample 1 + 3 2022-12-26 wk ahead inc flu ho~ 2 02 sample 1 + 4 2022-12-26 wk ahead inc flu ho~ 1 US sample 2 + 5 2022-12-26 wk ahead inc flu ho~ 1 01 sample 2 + 6 2022-12-26 wk ahead inc flu ho~ 1 02 sample 2 + +--- + + Code + expand_model_out_grid(config_tasks, round_id = "2022-12-26", + include_sample_ids = FALSE, bind_model_tasks = TRUE, output_types = c( + "random", "sample"), ) + Condition + Error in `expand_model_out_grid()`: + x "random" is not valid output type. + i `output_types` must be members of: "sample", "mean", and "pmf" + +--- + + Code + expand_model_out_grid(config_tasks, round_id = "2022-12-26", + include_sample_ids = FALSE, bind_model_tasks = FALSE, output_types = c( + "random"), ) + Condition + Error in `expand_model_out_grid()`: + x "random" is not valid output type. + i `output_types` must be members of: "sample", "mean", and "pmf" + +# expand_model_out_grid derived_task_ids ignoring works + + Code + expand_model_out_grid(config_tasks, round_id = "2022-10-22", + include_sample_ids = FALSE, bind_model_tasks = TRUE, output_types = "sample", + derived_task_ids = "target_end_date") + Output + # A tibble: 80 x 8 + reference_date target horizon location variant target_end_date output_type + + 1 2022-10-22 wk inc f~ 0 US AA NA sample + 2 2022-10-22 wk inc f~ 1 US AA NA sample + 3 2022-10-22 wk inc f~ 2 US AA NA sample + 4 2022-10-22 wk inc f~ 3 US AA NA sample + 5 2022-10-22 wk inc f~ 0 01 AA NA sample + 6 2022-10-22 wk inc f~ 1 01 AA NA sample + 7 2022-10-22 wk inc f~ 2 01 AA NA sample + 8 2022-10-22 wk inc f~ 3 01 AA NA sample + 9 2022-10-22 wk inc f~ 0 02 AA NA sample + 10 2022-10-22 wk inc f~ 1 02 AA NA sample + # i 70 more rows + # i 1 more variable: output_type_id + +--- + + Code + expand_model_out_grid(config_tasks, round_id = "2022-10-22", + include_sample_ids = TRUE, bind_model_tasks = TRUE, output_types = "sample", + derived_task_ids = "target_end_date", required_vals_only = TRUE) + Condition + Warning: + The compound task IDs horizon and target_end_date have all optional values. Representation of compound sample modeling tasks is not fully specified. + Output + # A tibble: 4 x 5 + reference_date location variant output_type output_type_id + + 1 2022-10-22 US AA sample 1 + 2 2022-10-22 01 AA sample 2 + 3 2022-10-22 US BB sample 3 + 4 2022-10-22 01 BB sample 4 + +--- + + Code + expand_model_out_grid(config_tasks, round_id = "2022-10-22", + include_sample_ids = FALSE, bind_model_tasks = FALSE, output_types = "sample", + derived_task_ids = c("location", "variant")) + Condition + Error in `expand_model_out_grid()`: + x Derived task IDs cannot have required task ID values. + ! "location" and "variant" have required task ID values. Ignored. + # expand_model_out_grid errors correctly Code diff --git a/tests/testthat/_snaps/match_tbl_to_model_task.md b/tests/testthat/_snaps/match_tbl_to_model_task.md new file mode 100644 index 00000000..cc4d3ae3 --- /dev/null +++ b/tests/testthat/_snaps/match_tbl_to_model_task.md @@ -0,0 +1,67 @@ +# match_tbl_to_model_task works + + Code + match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22") + Output + [[1]] + # A tibble: 60 x 8 + reference_date target horizon location target_end_date output_type + + 1 2022-10-22 wk flu hosp rate~ 0 US 2022-10-22 pmf + 2 2022-10-22 wk flu hosp rate~ 0 01 2022-10-22 pmf + 3 2022-10-22 wk flu hosp rate~ 0 02 2022-10-22 pmf + 4 2022-10-22 wk flu hosp rate~ 0 04 2022-10-22 pmf + 5 2022-10-22 wk flu hosp rate~ 0 05 2022-10-22 pmf + 6 2022-10-22 wk flu hosp rate~ 1 US 2022-10-29 pmf + 7 2022-10-22 wk flu hosp rate~ 1 01 2022-10-29 pmf + 8 2022-10-22 wk flu hosp rate~ 1 02 2022-10-29 pmf + 9 2022-10-22 wk flu hosp rate~ 1 04 2022-10-29 pmf + 10 2022-10-22 wk flu hosp rate~ 1 05 2022-10-29 pmf + # i 50 more rows + # i 2 more variables: output_type_id , value + + [[2]] + # A tibble: 1,530 x 8 + reference_date target horizon location target_end_date output_type + + 1 2022-10-22 wk inc flu hosp 0 US 2022-10-22 mean + 2 2022-10-22 wk inc flu hosp 0 01 2022-10-22 mean + 3 2022-10-22 wk inc flu hosp 0 02 2022-10-22 mean + 4 2022-10-22 wk inc flu hosp 0 04 2022-10-22 mean + 5 2022-10-22 wk inc flu hosp 0 05 2022-10-22 mean + 6 2022-10-22 wk inc flu hosp 1 US 2022-10-29 mean + 7 2022-10-22 wk inc flu hosp 1 01 2022-10-29 mean + 8 2022-10-22 wk inc flu hosp 1 02 2022-10-29 mean + 9 2022-10-22 wk inc flu hosp 1 04 2022-10-29 mean + 10 2022-10-22 wk inc flu hosp 1 05 2022-10-29 mean + # i 1,520 more rows + # i 2 more variables: output_type_id , value + + +--- + + Code + match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22", + output_types = "sample") + Output + [[1]] + NULL + + [[2]] + # A tibble: 1,500 x 8 + reference_date target horizon location target_end_date output_type + + 1 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 2 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 3 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 4 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 5 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 6 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 7 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 8 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 9 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 10 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + # i 1,490 more rows + # i 2 more variables: output_type_id , value + + diff --git a/tests/testthat/_snaps/submission_tmpl.md b/tests/testthat/_snaps/submission_tmpl.md index c3939889..247f27b0 100644 --- a/tests/testthat/_snaps/submission_tmpl.md +++ b/tests/testthat/_snaps/submission_tmpl.md @@ -290,3 +290,64 @@ x "random_var" is not valid task ID. i The `compound_taskid_set` must be a subset of "forecast_date", "target", "horizon", and "location". +# submission_tmpl output type subsetting works + + Code + submission_tmpl(config_tasks = config_tasks, round_id = "2022-12-26", + output_types = "sample") + Output + # A tibble: 6 x 7 + forecast_date target horizon location output_type output_type_id value + + 1 2022-12-26 wk ahead inc ~ 2 US sample 1 NA + 2 2022-12-26 wk ahead inc ~ 2 01 sample 1 NA + 3 2022-12-26 wk ahead inc ~ 2 02 sample 1 NA + 4 2022-12-26 wk ahead inc ~ 1 US sample 2 NA + 5 2022-12-26 wk ahead inc ~ 1 01 sample 2 NA + 6 2022-12-26 wk ahead inc ~ 1 02 sample 2 NA + +--- + + Code + submission_tmpl(config_tasks = config_tasks, round_id = "2022-12-26", + output_types = c("mean", "sample")) + Output + # A tibble: 12 x 7 + forecast_date target horizon location output_type output_type_id value + + 1 2022-12-26 wk ahead inc~ 2 US mean NA + 2 2022-12-26 wk ahead inc~ 1 US mean NA + 3 2022-12-26 wk ahead inc~ 2 01 mean NA + 4 2022-12-26 wk ahead inc~ 1 01 mean NA + 5 2022-12-26 wk ahead inc~ 2 02 mean NA + 6 2022-12-26 wk ahead inc~ 1 02 mean NA + 7 2022-12-26 wk ahead inc~ 2 US sample 1 NA + 8 2022-12-26 wk ahead inc~ 2 01 sample 1 NA + 9 2022-12-26 wk ahead inc~ 2 02 sample 1 NA + 10 2022-12-26 wk ahead inc~ 1 US sample 2 NA + 11 2022-12-26 wk ahead inc~ 1 01 sample 2 NA + 12 2022-12-26 wk ahead inc~ 1 02 sample 2 NA + +# submission_tmpl ignoring derived task ids works + + Code + submission_tmpl(config_tasks = config_tasks, round_id = "2022-10-22", + output_types = "sample", derived_task_ids = "target_end_date", + complete_cases_only = FALSE) + Output + # A tibble: 80 x 9 + reference_date target horizon location variant target_end_date output_type + + 1 2022-10-22 wk inc f~ 0 US AA NA sample + 2 2022-10-22 wk inc f~ 1 US AA NA sample + 3 2022-10-22 wk inc f~ 2 US AA NA sample + 4 2022-10-22 wk inc f~ 3 US AA NA sample + 5 2022-10-22 wk inc f~ 0 01 AA NA sample + 6 2022-10-22 wk inc f~ 1 01 AA NA sample + 7 2022-10-22 wk inc f~ 2 01 AA NA sample + 8 2022-10-22 wk inc f~ 3 01 AA NA sample + 9 2022-10-22 wk inc f~ 0 02 AA NA sample + 10 2022-10-22 wk inc f~ 1 02 AA NA sample + # i 70 more rows + # i 2 more variables: output_type_id , value + diff --git a/tests/testthat/_snaps/validate_submission.md b/tests/testthat/_snaps/validate_submission.md index 34331062..bbd82308 100644 --- a/tests/testthat/_snaps/validate_submission.md +++ b/tests/testthat/_snaps/validate_submission.md @@ -943,3 +943,219 @@ Message: Column data types match hub schema. +# Ignoring derived_task_ids in validate_submission works + + Code + str(validate_submission(hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", skip_submit_window_check = TRUE, + derived_task_ids = "target_end_date")) + Output + List of 24 + $ valid_config :List of 4 + ..$ message : chr "All hub config files are valid. \n " + ..$ where : chr "samples" + ..$ call : chr "check_config_hub_valid" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_exists :List of 4 + ..$ message : chr "File exists at path 'model-output/flu-base/2022-10-22-flu-base.csv'. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_exists" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_name :List of 4 + ..$ message : chr "File name \"2022-10-22-flu-base.csv\" is valid. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_name" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_location :List of 4 + ..$ message : chr "File directory name matches `model_id`\n metadata in file name. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_location" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ round_id_valid :List of 4 + ..$ message : chr "`round_id` is valid. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_valid_round_id" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_format :List of 4 + ..$ message : chr "File is accepted hub format. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_format" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ metadata_exists :List of 4 + ..$ message : chr "Metadata file exists at path 'model-metadata/flu-base.yml'. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_submission_metadata_file_exists" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_read :List of 4 + ..$ message : chr "File could be read successfully. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_read" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ valid_round_id_col :List of 4 + ..$ message : chr "`round_id_col` name is valid. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_valid_round_id_col" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ unique_round_id :List of 4 + ..$ message : chr "`round_id` column \"reference_date\" contains a single, unique round ID value. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_unique_round_id" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ match_round_id :List of 4 + ..$ message : chr "All `round_id_col` \"reference_date\" values match submission `round_id` from file name. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_match_round_id" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ colnames :List of 4 + ..$ message : chr "Column names are consistent with expected round task IDs and std column names. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_colnames" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ col_types :List of 4 + ..$ message : chr "Column data types match hub schema. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_col_types" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ valid_vals :List of 5 + ..$ message : chr "`tbl` contains valid values/value combinations. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ error_tbl : NULL + ..$ call : chr "check_tbl_values" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ rows_unique :List of 4 + ..$ message : chr "All combinations of task ID column/`output_type`/`output_type_id` values are unique. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_rows_unique" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ req_vals :List of 5 + ..$ message : chr "Required task ID/output type/output type ID combinations all present. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ missing : tibble [0 x 7] (S3: tbl_df/tbl/data.frame) + .. ..$ location : chr(0) + .. ..$ reference_date : chr(0) + .. ..$ horizon : chr(0) + .. ..$ target_end_date: chr(0) + .. ..$ target : chr(0) + .. ..$ output_type : chr(0) + .. ..$ output_type_id : chr(0) + ..$ call : chr "check_tbl_values_required" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ value_col_valid :List of 4 + ..$ message : chr "Values in column `value` all valid with respect to modeling task config. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_value_col" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ value_col_non_desc :List of 4 + ..$ message : chr "No quantile or cdf output types to check for non-descending values.\n Check skipped." + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_value_col_ascending" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_info" "hub_check" "rlang_message" "message" ... + $ value_col_sum1 :List of 5 + ..$ message : chr "Values in `value` column do sum to 1 for all unique task ID value combination of pmf\n output types. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ error_tbl : NULL + ..$ call : chr "check_tbl_value_col_sum1" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ spl_compound_taskid_set:List of 6 + ..$ message : chr "All samples in a model task conform to single, unique compound task ID set that matches or is\n coarser than"| __truncated__ + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ errors : NULL + ..$ compound_taskid_set:List of 2 + .. ..$ 1: NULL + .. ..$ 2: chr [1:2] "reference_date" "location" + ..$ call : chr "check_tbl_spl_compound_taskid_set" + ..$ use_cli_format : logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ spl_compound_tid :List of 5 + ..$ message : chr "Each sample compound task ID contains single, unique value. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ errors : NULL + ..$ call : chr "check_tbl_spl_compound_tid" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ spl_non_compound_tid :List of 5 + ..$ message : chr "Task ID combinations of non compound task id values consistent across modeling task samples. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ errors : NULL + ..$ call : chr "check_tbl_spl_non_compound_tid" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ spl_n :List of 5 + ..$ message : chr "Required samples per compound idx task present. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ errors : NULL + ..$ call : chr "check_tbl_spl_n" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ horizon_timediff :List of 4 + ..$ message : chr "Time differences between t0 var `reference_date` and t1 var\n `target_end_date` all match expected perio"| __truncated__ + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : NULL + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + - attr(*, "class")= chr [1:2] "hub_validations" "list" + +--- + + Code + validate_submission(hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", skip_submit_window_check = TRUE, + derived_task_ids = "target_end_date")[c("valid_vals", "req_vals", + "value_col_valid", "spl_n", "spl_compound_taskid_set", "spl_compound_tid", + "spl_non_compound_tid")] + Output + $valid_vals + + Message: + `tbl` contains valid values/value combinations. + + $req_vals + + Message: + Required task ID/output type/output type ID combinations all present. + + $value_col_valid + + Message: + Values in column `value` all valid with respect to modeling task config. + + $spl_n + + Message: + Required samples per compound idx task present. + + $spl_compound_taskid_set + + Message: + All samples in a model task conform to single, unique compound task ID set that matches or is coarser than the configured `compound_taksid_set`. + + $spl_compound_tid + + Message: + Each sample compound task ID contains single, unique value. + + $spl_non_compound_tid + + Message: + Task ID combinations of non compound task id values consistent across modeling task samples. + + diff --git a/tests/testthat/test-check_tbl_spl_compound_taskid_set.R b/tests/testthat/test-check_tbl_spl_compound_taskid_set.R index b63b45a8..b1c7cdd3 100644 --- a/tests/testthat/test-check_tbl_spl_compound_taskid_set.R +++ b/tests/testthat/test-check_tbl_spl_compound_taskid_set.R @@ -168,3 +168,31 @@ test_that("Finer compound_taskid_sets work", { ) ) }) + +test_that("Ignoring derived_task_ids in check_tbl_spl_compound_taskid_set works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- tbl_orig <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + # Introduce invalid value to derived task id that should be ignored when using + # `derived_task_ids`. + tbl[1, "target_end_date"] <- "random_date" + expect_snapshot( + check_tbl_spl_compound_taskid_set(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + # Check that ignoring derived task ids returns same result as not ignoring. + expect_equal( + check_tbl_spl_compound_taskid_set(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ), + check_tbl_spl_compound_taskid_set(tbl_orig, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_spl_compound_tid.R b/tests/testthat/test-check_tbl_spl_compound_tid.R index 78dc1f6e..8d3d4293 100644 --- a/tests/testthat/test-check_tbl_spl_compound_tid.R +++ b/tests/testthat/test-check_tbl_spl_compound_tid.R @@ -56,8 +56,6 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", n_samples = 1L ) - - # Normal validation should return check failure expect_snapshot( str( @@ -72,3 +70,31 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", ) ) }) + +test_that("Ignoring derived_task_ids in check_tbl_spl_compound_tid works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- tbl_orig <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + # Introduce invalid value to derived task id that should be ignored when using + # `derived_task_ids`. + tbl[1, "target_end_date"] <- "random_date" + expect_snapshot( + check_tbl_spl_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + # Check that ignoring derived task ids returns same result as not ignoring. + expect_equal( + check_tbl_spl_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ), + check_tbl_spl_compound_tid(tbl_orig, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_spl_n.R b/tests/testthat/test-check_tbl_spl_n.R index 53467ea2..db56f9cc 100644 --- a/tests/testthat/test-check_tbl_spl_n.R +++ b/tests/testthat/test-check_tbl_spl_n.R @@ -65,7 +65,7 @@ test_that("check_tbl_spl_n works", { }) -test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", { +test_that("Overriding compound_taskid_set in check_tbl_spl_n works", { hub_path <- test_path("testdata/hub-spl") file_path <- "flu-base/2022-10-22-flu-base.csv" round_id <- "2022-10-22" @@ -128,3 +128,32 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", ) ) }) + + +test_that("Ignoring derived_task_ids in check_tbl_spl_n works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- tbl_orig <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + # Introduce invalid value to derived task id that should be ignored when using + # `derived_task_ids`. + tbl[1, "target_end_date"] <- "random_date" + expect_snapshot( + check_tbl_spl_n(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + # Check that ignoring derived task ids returns same result as not ignoring. + expect_equal( + check_tbl_spl_n(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ), + check_tbl_spl_n(tbl_orig, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_spl_non_compound_tid.R b/tests/testthat/test-check_tbl_spl_non_compound_tid.R index 43f535e5..ac735c7a 100644 --- a/tests/testthat/test-check_tbl_spl_non_compound_tid.R +++ b/tests/testthat/test-check_tbl_spl_non_compound_tid.R @@ -72,3 +72,31 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", ) ) }) + +test_that("Ignoring derived_task_ids in check_tbl_spl_compound_tid works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- tbl_orig <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + # Introduce invalid value to derived task id that should be ignored when using + # `derived_task_ids`. + tbl[1, "target_end_date"] <- "random_date" + expect_snapshot( + check_tbl_spl_non_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + # Check that ignoring derived task ids returns same result as not ignoring. + expect_equal( + check_tbl_spl_non_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ), + check_tbl_spl_non_compound_tid(tbl_orig, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_value_col.R b/tests/testthat/test-check_tbl_value_col.R index e7a02dac..12ca766e 100644 --- a/tests/testthat/test-check_tbl_value_col.R +++ b/tests/testthat/test-check_tbl_value_col.R @@ -43,3 +43,31 @@ test_that("check_tbl_value_col errors correctly", { check_tbl_value_col(tbl, round_id, file_path, hub_path) ) }) + +test_that("Ignoring derived_task_ids in check_tbl_value_col works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- tbl_orig <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + # Introduce invalid value to derived task id that should be ignored when using + # `derived_task_ids`. + tbl[1, "target_end_date"] <- "random_date" + expect_snapshot( + check_tbl_value_col(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + # Check that ignoring derived task ids returns same result as not ignoring. + expect_equal( + check_tbl_value_col(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ), + check_tbl_value_col(tbl_orig, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_values.R b/tests/testthat/test-check_tbl_values.R index 35c116be..bee5d360 100644 --- a/tests/testthat/test-check_tbl_values.R +++ b/tests/testthat/test-check_tbl_values.R @@ -195,3 +195,56 @@ test_that("check_tbl_values works with v3 spec samples", { ) ) }) + + +test_that("Ignoring derived_task_ids in check_tbl_values works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- tbl_orig <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + # Introduce invalid value to derived task id that should be ignored when using + # `derived_task_ids`. + tbl[1, "target_end_date"] <- "random_date" + expect_snapshot( + check_tbl_values(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + # Check that ignoring derived task ids returns same result as not ignoring. + expect_equal( + check_tbl_values(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ), + check_tbl_values(tbl_orig, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + + # Trigger invalid value error + tbl[1, "horizon"] <- tbl_orig[1, "horizon"] <- "9" + # Trigger invalid value combination error + tbl[2, "output_type"] <- tbl_orig[2, "output_type"] <- "pmf" + expect_snapshot( + check_tbl_values(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + expect_snapshot( + check_tbl_values(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + )$error_tbl + ) + + expect_equal( + check_tbl_values(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ), + check_tbl_values(tbl_orig, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_values_required.R b/tests/testthat/test-check_tbl_values_required.R index b9e90c27..4c5484db 100644 --- a/tests/testthat/test-check_tbl_values_required.R +++ b/tests/testthat/test-check_tbl_values_required.R @@ -222,5 +222,32 @@ test_that("check_tbl_values_required works with v3 spec samples", { c("pmf", "sample", "mean", "median") ) expect_true(all(missing$location == "US")) +}) +test_that("Ignoring derived_task_ids in check_tbl_values_required works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- tbl_orig <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + # Introduce invalid value to derived task id that should be ignored when using + # `derived_task_ids`. + tbl[1, "target_end_date"] <- "random_date" + expect_snapshot( + check_tbl_values_required(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + # Check that ignoring derived task ids returns same result as not ignoring. + expect_equal( + check_tbl_values_required(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ), + check_tbl_values_required(tbl_orig, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) }) diff --git a/tests/testthat/test-expand_model_out_grid.R b/tests/testthat/test-expand_model_out_grid.R index 930362a8..f94f4230 100644 --- a/tests/testthat/test-expand_model_out_grid.R +++ b/tests/testthat/test-expand_model_out_grid.R @@ -340,6 +340,100 @@ test_that("expand_model_out_grid output controls with samples work correctly", { ) }) + +test_that("expand_model_out_grid output type subsetting works", { + config_tasks <- hubUtils::read_config_file( + system.file("config", "tasks-comp-tid.json", + package = "hubValidations" + ) + ) + expect_snapshot( + expand_model_out_grid(config_tasks, + round_id = "2022-12-26", + include_sample_ids = TRUE, + bind_model_tasks = FALSE, + output_types = c("pmf", "sample"), + ) + ) + + expect_snapshot( + expand_model_out_grid(config_tasks, + round_id = "2022-12-26", + include_sample_ids = TRUE, + bind_model_tasks = FALSE, + output_types = "sample", + ) + ) + + expect_snapshot( + expand_model_out_grid(config_tasks, + round_id = "2022-12-26", + include_sample_ids = TRUE, + bind_model_tasks = TRUE, + output_types = "sample", + ) + ) + + # If a valid output type is provided, invalid ones just ignored + expect_snapshot( + expand_model_out_grid(config_tasks, + round_id = "2022-12-26", + include_sample_ids = FALSE, + bind_model_tasks = TRUE, + output_types = c("random", "sample"), + ), + error = TRUE + ) + + # If no valid output type provided, errors + expect_snapshot( + expand_model_out_grid(config_tasks, + round_id = "2022-12-26", + include_sample_ids = FALSE, + bind_model_tasks = FALSE, + output_types = c("random"), + ), + error = TRUE + ) +}) + +test_that("expand_model_out_grid derived_task_ids ignoring works", { + config_tasks <- hubUtils::read_config(test_path("testdata", "hub-spl")) + + expect_snapshot( + expand_model_out_grid(config_tasks, + round_id = "2022-10-22", + include_sample_ids = FALSE, + bind_model_tasks = TRUE, + output_types = "sample", + derived_task_ids = "target_end_date" + ) + ) + expect_snapshot( + expand_model_out_grid(config_tasks, + round_id = "2022-10-22", + include_sample_ids = TRUE, + bind_model_tasks = TRUE, + output_types = "sample", + derived_task_ids = "target_end_date", + required_vals_only = TRUE + ) + ) + + expect_snapshot( + expand_model_out_grid(config_tasks, + round_id = "2022-10-22", + include_sample_ids = FALSE, + bind_model_tasks = FALSE, + output_types = "sample", + derived_task_ids = c("location", "variant") + ), + error = TRUE + ) +}) + + + test_that("expand_model_out_grid errors correctly", { # Specifying a round in a hub with multiple rounds hub_con <- hubData::connect_hub( diff --git a/tests/testthat/test-match_tbl_to_model_task.R b/tests/testthat/test-match_tbl_to_model_task.R new file mode 100644 index 00000000..18cfc3be --- /dev/null +++ b/tests/testthat/test-match_tbl_to_model_task.R @@ -0,0 +1,18 @@ +test_that("match_tbl_to_model_task works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + tbl <- read_model_out_file( + file_path = "flu-base/2022-10-22-flu-base.csv", + hub_path, coerce_types = "chr" + ) + config_tasks <- hubUtils::read_config(hub_path, "tasks") + + expect_snapshot( + match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22") + ) + expect_snapshot( + match_tbl_to_model_task(tbl, config_tasks, + round_id = "2022-10-22", + output_types = "sample" + ) + ) +}) diff --git a/tests/testthat/test-submission_tmpl.R b/tests/testthat/test-submission_tmpl.R index 8d1b8e05..6e9aae72 100644 --- a/tests/testthat/test-submission_tmpl.R +++ b/tests/testthat/test-submission_tmpl.R @@ -217,3 +217,41 @@ test_that("submission_tmpl errors correctly", { error = TRUE ) }) + +test_that("submission_tmpl output type subsetting works", { + config_tasks <- hubUtils::read_config_file(system.file("config", "tasks-comp-tid.json", + package = "hubValidations" + )) + + # Subsetting for a single output type + expect_snapshot( + submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-12-26", + output_types = "sample" + ) + ) + + # Subsetting for a two output types + expect_snapshot( + submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-12-26", + output_types = c("mean", "sample") + ) + ) +}) + +test_that("submission_tmpl ignoring derived task ids works", { + config_tasks <- hubUtils::read_config(test_path("testdata", "hub-spl")) + + expect_snapshot( + submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-10-22", + output_types = "sample", + derived_task_ids = "target_end_date", + complete_cases_only = FALSE + ) + ) +}) diff --git a/tests/testthat/test-validate_submission.R b/tests/testthat/test-validate_submission.R index 89ae1d05..1916cdbd 100644 --- a/tests/testthat/test-validate_submission.R +++ b/tests/testthat/test-validate_submission.R @@ -319,3 +319,63 @@ test_that("validate_submission handles overriding output type id data type corre )[["col_types"]] ) }) + +test_that("Ignoring derived_task_ids in validate_submission works", { + # Validation passes + expect_snapshot( + str( + validate_submission( + hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", + skip_submit_window_check = TRUE, + derived_task_ids = "target_end_date" + ) + ) + ) + # Results of validation the same + expect_equal( + validate_submission( + hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", + skip_submit_window_check = TRUE, + derived_task_ids = "target_end_date" + ), + validate_submission( + hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", + skip_submit_window_check = TRUE + ) + ) + + # Ensure derived_task_ids values are ignored in validate submission by introducing + # deliberate error in derived_task_ids through mocking. + # This should not impact successful validation of affected checks + tbl_mod <- read_model_out_file( + file_path = "flu-base/2022-10-22-flu-base.csv", + hub_path = system.file("testhubs/samples", package = "hubValidations"), + coerce_types = "chr" + ) + tbl_mod[1, "target_end_date"] <- "2092-10-22" + mockery::stub( + validate_submission, + "read_model_out_file", + tbl_mod, + 2 + ) + expect_snapshot( + validate_submission( + hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", + skip_submit_window_check = TRUE, + derived_task_ids = "target_end_date" + )[c( + "valid_vals", + "req_vals", + "value_col_valid", + "spl_n", + "spl_compound_taskid_set", + "spl_compound_tid", + "spl_non_compound_tid" + )] + ) +}) diff --git a/vignettes/articles/validate-pr.Rmd b/vignettes/articles/validate-pr.Rmd index 2b178c24..9571c1af 100644 --- a/vignettes/articles/validate-pr.Rmd +++ b/vignettes/articles/validate-pr.Rmd @@ -56,6 +56,29 @@ Here's an example of what the workflow looks like on GitHub: knitr::include_graphics("assets/ga-action-screenshot.png") ``` +### Ignoring derived task IDs to improve performance + +Argument **`derived_task_ids`** allows for the specification of **task IDs that are derived from other task IDs** to be ignored, which **can often lead to a significant improvement in validation performance**. + +#### What are derived task IDs? + +Derived task IDs are a class of task ID whose values depend on the values of other task IDs. As such, the **validity of derived task ID values is dependent on the values of the task IDs they are derived from** and the validity of value combinations of derived and other task IDs is much more restricted. A common example of a derived task ID is `target_end_date` which is most often derived from the `reference_date` or `origin_date` and `horizon` task ids. + +#### Implications of derived task IDs on validation performance + +With standard validation, derived task IDs like `target_end_date` tend to pollute the expanded grid used to validate valid value combination with invalid combinations. That's because, while a given combination of `reference_date` and `horizon` values will only have a single valid `target_end_date` value, the `target_end_date` property in the config will contain all possible valid values for all combinations of `reference_date` and `horizon`. Because its the values in the config used to create the expanded valid values grid, the sizes these grids can reach as function of config complexity can often put excessive strain on the system's memory, affecting overall performance of PR validation. + +#### How to ignore derived task IDs + +Supplying the names of derived task IDs to argument `derived_task_ids` will ignore them during validation checks and, depending on config complexity, this **can lead to a significant improvement in validation performance**. + + ### Skipping submission window checks @@ -109,6 +132,7 @@ These settings can be modified if required though the use of arguments `file_mod Is ignored when checking model metadata files as well as when `file_modification_check` is set to `"none"`. +