From 2aa70e9c75275e5393780452651b9c44051d6e92 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Fri, 9 Aug 2024 16:52:29 +0300 Subject: [PATCH 01/17] optimise check_tbl_values by output_type batching & ignoring derived task-ids. Related to #93 --- R/check_tbl_values.R | 76 +++++++++++++++++++++++++------------- R/config_tasks-utils.R | 61 ++++++++++++++++++++++++++++++ R/validate_model_data.R | 7 +++- man/check_tbl_values.Rd | 6 ++- man/validate_model_data.Rd | 7 +++- 5 files changed, 127 insertions(+), 30 deletions(-) diff --git a/R/check_tbl_values.R b/R/check_tbl_values.R index 89a75de2..5ddb954e 100644 --- a/R/check_tbl_values.R +++ b/R/check_tbl_values.R @@ -1,34 +1,24 @@ #' Check model output data tbl contains valid value combinations #' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**. #' @inherit check_tbl_colnames params +#' @inheritParams expand_model_out_grid #' @inherit check_tbl_colnames return #' @export -check_tbl_values <- function(tbl, round_id, file_path, hub_path) { +check_tbl_values <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { config_tasks <- hubUtils::read_config(hub_path, "tasks") - - # Coerce accepted vals to character for easier comparison of - # values. Tried to use arrow tbls for comparisons as more efficient when - # working with larger files but currently arrow does not match NAs as dplyr - # does, returning false positives for mean & median rows which contain NA in - # output type ID column. - accepted_vals <- expand_model_out_grid( - config_tasks = config_tasks, - round_id = round_id, - all_character = TRUE - ) - - # This approach uses dplyr to identify tbl rows that don't have a complete match - # in accepted_vals. - accepted_vals$valid <- TRUE - if (hubUtils::is_v3_config(config_tasks)) { - out_type_ids <- tbl[["output_type_id"]] - tbl[tbl$output_type == "sample", "output_type_id"] <- NA + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ } - valid_tbl <- dplyr::left_join( - tbl, accepted_vals, - by = names(tbl)[names(tbl) != "value"] - ) + valid_tbl <- tbl %>% + split(f = tbl$output_type) %>% + purrr::imap( + ~ check_values_by_output_type(tbl = .x, output_type = .y, + config_tasks = config_tasks, + round_id = round_id, + derived_task_ids = derived_task_ids) + ) %>% purrr::list_rbind() check <- !any(is.na(valid_tbl$valid)) @@ -36,7 +26,8 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) { details <- NULL error_tbl <- NULL } else { - error_summary <- summarise_invalid_values(valid_tbl, accepted_vals) + error_summary <- summarise_invalid_values(valid_tbl, config_tasks, round_id, + derived_task_ids) details <- error_summary$msg if (length(error_summary$invalid_combs_idx) == 0L) { error_tbl <- NULL @@ -66,10 +57,43 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) { ) } -summarise_invalid_values <- function(valid_tbl, accepted_vals) { +check_values_by_output_type <- function(tbl, output_type, config_tasks, round_id, + derived_task_ids = NULL) { + + # Coerce accepted vals to character for easier comparison of + # values. Tried to use arrow tbls for comparisons as more efficient when + # working with larger files but currently arrow does not match NAs as dplyr + # does, returning false positives for mean & median rows which contain NA in + # output type ID column. + accepted_vals <- expand_model_out_grid( + config_tasks = config_tasks, + round_id = round_id, + all_character = TRUE, + output_types = output_type, + derived_task_ids = derived_task_ids + ) + + # This approach uses dplyr to identify tbl rows that don't have a complete match + # in accepted_vals. + accepted_vals$valid <- TRUE + if (hubUtils::is_v3_config(config_tasks)) { + out_type_ids <- tbl[["output_type_id"]] + tbl[tbl$output_type == "sample", "output_type_id"] <- NA + } + + valid_tbl <- dplyr::left_join( + tbl, accepted_vals, + by = names(tbl)[names(tbl) != "value"] + ) + +} + +summarise_invalid_values <- function(valid_tbl, config_tasks, round_id, + derived_task_ids) { cols <- names(valid_tbl)[!names(valid_tbl) %in% c("value", "valid")] uniq_tbl <- purrr::map(valid_tbl[cols], unique) - uniq_config <- purrr::map(accepted_vals[cols], unique) + uniq_config <- get_round_config_values(config_tasks, round_id, + derived_task_ids)[cols] invalid_vals <- purrr::map2( uniq_tbl, uniq_config, diff --git a/R/config_tasks-utils.R b/R/config_tasks-utils.R index 1c6e1bc6..2266f529 100644 --- a/R/config_tasks-utils.R +++ b/R/config_tasks-utils.R @@ -27,3 +27,64 @@ get_round_output_type_names <- function(config_tasks, round_id, out } } + +# get all task_ids values +get_round_config_values <- function(config_tasks, round_id, + derived_task_ids = NULL) { + model_tasks <- hubUtils::get_round_model_tasks(config_tasks, round_id) + task_id_names <- setdiff( + hubUtils::get_round_task_id_names(config_tasks, round_id), + derived_task_ids + ) + task_id_values <- purrr::map( + purrr::set_names(task_id_names), + \(.x) get_task_id_values(.x, model_tasks) + ) + if (!is.null(derived_task_ids)) { + task_id_values <- c( + task_id_values, + purrr::map( + purrr::set_names(derived_task_ids), + ~NA_character_ + ) + ) + } + + output_type_names <- get_round_output_type_names(config_tasks, round_id) + output_type_id_values <- purrr::map( + output_type_names, + \(.x) get_output_type_id_values(.x, model_tasks) + ) %>% purrr::flatten_chr() + + output_types <- list( + output_type = output_type_names, + output_type_id = output_type_id_values + ) + + c(task_id_values, output_types) +} +get_task_id_values <- function(task_id, model_tasks) { + purrr::map( + model_tasks, + ~ .x[["task_ids"]][[task_id]] + ) %>% + unlist(use.names = FALSE) %>% + unique() %>% + as.character() +} + +get_output_type_id_values <- function(output_type, model_tasks) { + out <- purrr::map( + model_tasks, + ~ .x[["output_type"]][[output_type]][["output_type_id"]] + ) %>% + unlist(use.names = FALSE) %>% + unique() %>% + as.character() + + if (length(out) == 0L) { + return(NA_character_) + } else { + return(out) + } +} diff --git a/R/validate_model_data.R b/R/validate_model_data.R index 5cd7a03b..f4014149 100644 --- a/R/validate_model_data.R +++ b/R/validate_model_data.R @@ -3,6 +3,7 @@ #' @inheritParams check_tbl_unique_round_id #' @inheritParams validate_model_file #' @inheritParams hubData::create_hub_schema +#' @inheritParams expand_model_out_grid #' @inherit validate_model_file return #' @export #' @details @@ -29,7 +30,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, "double", "integer", "logical", "Date" ), - validations_cfg_path = NULL) { + validations_cfg_path = NULL, + derived_task_ids = NULL) { checks <- new_hub_validations() file_meta <- parse_file_name(file_path) @@ -136,7 +138,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl_chr, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$valid_vals)) { diff --git a/man/check_tbl_values.Rd b/man/check_tbl_values.Rd index 19229251..386bb55c 100644 --- a/man/check_tbl_values.Rd +++ b/man/check_tbl_values.Rd @@ -4,7 +4,7 @@ \alias{check_tbl_values} \title{Check model output data tbl contains valid value combinations} \usage{ -check_tbl_values(tbl, round_id, file_path, hub_path) +check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = NULL) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. Column types must \strong{all be character}.} @@ -23,6 +23,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/validate_model_data.Rd b/man/validate_model_data.Rd index a06355cc..4ffa95e1 100644 --- a/man/validate_model_data.Rd +++ b/man/validate_model_data.Rd @@ -10,7 +10,8 @@ validate_model_data( round_id_col = NULL, output_type_id_datatype = c("from_config", "auto", "character", "double", "integer", "logical", "Date"), - validations_cfg_path = NULL + validations_cfg_path = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -47,6 +48,10 @@ behaviour so use with care.} \item{validations_cfg_path}{Path to \code{validations.yml} file. If \code{NULL} defaults to \code{hub-config/validations.yml}.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ An object of class \code{hub_validations}. Each named element contains From 93a4bf2fef08641553e794f37a9aedac1d1865de Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Mon, 12 Aug 2024 17:08:50 +0300 Subject: [PATCH 02/17] Ignore derived task-ids in check_tbl_values_required --- R/check_tbl_values_required.R | 12 +++++++++--- man/check_tbl_values_required.Rd | 12 +++++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/R/check_tbl_values_required.R b/R/check_tbl_values_required.R index 0077223b..496f581d 100644 --- a/R/check_tbl_values_required.R +++ b/R/check_tbl_values_required.R @@ -5,18 +5,23 @@ #' @inherit check_tbl_colnames params #' @inherit check_tbl_col_types return #' @export -check_tbl_values_required <- function(tbl, round_id, file_path, hub_path) { +check_tbl_values_required <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { tbl[["value"]] <- NULL config_tasks <- hubUtils::read_config(hub_path, "tasks") if (hubUtils::is_v3_config(config_tasks)) { tbl[tbl$output_type == "sample", "output_type_id"] <- NA } + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } req <- expand_model_out_grid( config_tasks, round_id = round_id, required_vals_only = TRUE, all_character = TRUE, - bind_model_tasks = FALSE + bind_model_tasks = FALSE, + derived_task_ids = derived_task_ids ) full <- expand_model_out_grid( @@ -25,7 +30,8 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path) { required_vals_only = FALSE, all_character = TRUE, as_arrow_table = FALSE, - bind_model_tasks = FALSE + bind_model_tasks = FALSE, + derived_task_ids = derived_task_ids ) tbl <- purrr::map( diff --git a/man/check_tbl_values_required.Rd b/man/check_tbl_values_required.Rd index 084df7c2..83cd5aab 100644 --- a/man/check_tbl_values_required.Rd +++ b/man/check_tbl_values_required.Rd @@ -5,7 +5,13 @@ \title{Check all required task ID/output type/output type ID value combinations present in model data.} \usage{ -check_tbl_values_required(tbl, round_id, file_path, hub_path) +check_tbl_values_required( + tbl, + round_id, + file_path, + hub_path, + derived_task_ids = NULL +) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. Column types must \strong{all be character}.} @@ -24,6 +30,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: From 2a2e94888b5e8d7f9a63769cdef7f918b5b0eea3 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 11:48:53 +0300 Subject: [PATCH 03/17] add match_tbl_to_model_task function --- NAMESPACE | 1 + R/match_tbl_to_model_task.R | 49 ++++++++++++++ man/match_tbl_to_model_task.Rd | 59 ++++++++++++++++ .../_snaps/match_tbl_to_model_task.md | 67 +++++++++++++++++++ tests/testthat/test-match_tbl_to_model_task.R | 18 +++++ 5 files changed, 194 insertions(+) create mode 100644 R/match_tbl_to_model_task.R create mode 100644 man/match_tbl_to_model_task.Rd create mode 100644 tests/testthat/_snaps/match_tbl_to_model_task.md create mode 100644 tests/testthat/test-match_tbl_to_model_task.R diff --git a/NAMESPACE b/NAMESPACE index a63d9eed..de940c92 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,7 @@ export(is_exec_warn) export(is_failure) export(is_info) export(is_success) +export(match_tbl_to_model_task) export(new_hub_validations) export(not_pass) export(opt_check_metadata_team_max_model_n) diff --git a/R/match_tbl_to_model_task.R b/R/match_tbl_to_model_task.R new file mode 100644 index 00000000..6974599b --- /dev/null +++ b/R/match_tbl_to_model_task.R @@ -0,0 +1,49 @@ +#' Match model output `tbl` data to their model tasks in `config_tasks`. +#' +#' Split and match model output `tbl` data to their corresponding model tasks in +#' `config_tasks`. Useful for performing model task specific checks on model output. +#' For v3 samples, the `output_type_id` column is set to `NA` for `sample` outputs. +#' @inheritParams expand_model_out_grid +#' @inheritParams check_tbl_colnames +#' +#' @return A list containing a `tbl_df` of model output data matched to a model +#' task with one element per round model task. +#' @export +#' +#' @examples +#' hub_path <- system.file("testhubs/samples", package = "hubValidations") +#' tbl <- read_model_out_file( +#' file_path = "flu-base/2022-10-22-flu-base.csv", +#' hub_path, coerce_types = "chr" +#' ) +#' config_tasks <- hubUtils::read_config(hub_path, "tasks") +#' match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22") +#' match_tbl_to_model_task(tbl, config_tasks, +#' round_id = "2022-10-22", +#' output_types = "sample" +#' ) +match_tbl_to_model_task <- function(tbl, config_tasks, round_id, + output_types = NULL, derived_task_ids = NULL, + all_character = TRUE) { + join_cols <- names(tbl)[names(tbl) != "value"] + if (hubUtils::is_v3_config(config_tasks)) { + tbl[tbl$output_type == "sample", "output_type_id"] <- NA + } + + expand_model_out_grid( + config_tasks, + round_id = round_id, + required_vals_only = FALSE, + all_character = TRUE, + as_arrow_table = FALSE, + bind_model_tasks = FALSE, + output_types = output_types, + derived_task_ids = derived_task_ids + ) %>% + purrr::map(\(.x) { + if (nrow(.x) == 0L) { + return(NULL) + } + dplyr::inner_join(.x, tbl, by = join_cols) + }) +} diff --git a/man/match_tbl_to_model_task.Rd b/man/match_tbl_to_model_task.Rd new file mode 100644 index 00000000..bf930b1e --- /dev/null +++ b/man/match_tbl_to_model_task.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/match_tbl_to_model_task.R +\name{match_tbl_to_model_task} +\alias{match_tbl_to_model_task} +\title{Match model output \code{tbl} data to their model tasks in \code{config_tasks}.} +\usage{ +match_tbl_to_model_task( + tbl, + config_tasks, + round_id, + output_types = NULL, + derived_task_ids = NULL, + all_character = TRUE +) +} +\arguments{ +\item{tbl}{a tibble/data.frame of the contents of the file being validated.} + +\item{config_tasks}{a list version of the content's of a hub's \code{tasks.json} +config file, accessed through the \code{"config_tasks"} attribute of a \verb{} +object or function \code{\link[hubUtils:read_config]{hubUtils::read_config()}}.} + +\item{round_id}{Character string. Round identifier. If the round is set to +\code{round_id_from_variable: true}, IDs are values of the task ID defined in the round's +\code{round_id} property of \code{config_tasks}. +Otherwise should match round's \code{round_id} value in config. Ignored if hub +contains only a single round.} + +\item{output_types}{Character vector of output type names to include. +Use to subset for grids for specific output types.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} + +\item{all_character}{Logical. Whether to return all character column.} +} +\value{ +A list containing a \code{tbl_df} of model output data matched to a model +task with one element per round model task. +} +\description{ +Split and match model output \code{tbl} data to their corresponding model tasks in +\code{config_tasks}. Useful for performing model task specific checks on model output. +For v3 samples, the \code{output_type_id} column is set to \code{NA} for \code{sample} outputs. +} +\examples{ +hub_path <- system.file("testhubs/samples", package = "hubValidations") +tbl <- read_model_out_file( + file_path = "flu-base/2022-10-22-flu-base.csv", + hub_path, coerce_types = "chr" +) +config_tasks <- hubUtils::read_config(hub_path, "tasks") +match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22") +match_tbl_to_model_task(tbl, config_tasks, + round_id = "2022-10-22", + output_types = "sample" +) +} diff --git a/tests/testthat/_snaps/match_tbl_to_model_task.md b/tests/testthat/_snaps/match_tbl_to_model_task.md new file mode 100644 index 00000000..cc4d3ae3 --- /dev/null +++ b/tests/testthat/_snaps/match_tbl_to_model_task.md @@ -0,0 +1,67 @@ +# match_tbl_to_model_task works + + Code + match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22") + Output + [[1]] + # A tibble: 60 x 8 + reference_date target horizon location target_end_date output_type + + 1 2022-10-22 wk flu hosp rate~ 0 US 2022-10-22 pmf + 2 2022-10-22 wk flu hosp rate~ 0 01 2022-10-22 pmf + 3 2022-10-22 wk flu hosp rate~ 0 02 2022-10-22 pmf + 4 2022-10-22 wk flu hosp rate~ 0 04 2022-10-22 pmf + 5 2022-10-22 wk flu hosp rate~ 0 05 2022-10-22 pmf + 6 2022-10-22 wk flu hosp rate~ 1 US 2022-10-29 pmf + 7 2022-10-22 wk flu hosp rate~ 1 01 2022-10-29 pmf + 8 2022-10-22 wk flu hosp rate~ 1 02 2022-10-29 pmf + 9 2022-10-22 wk flu hosp rate~ 1 04 2022-10-29 pmf + 10 2022-10-22 wk flu hosp rate~ 1 05 2022-10-29 pmf + # i 50 more rows + # i 2 more variables: output_type_id , value + + [[2]] + # A tibble: 1,530 x 8 + reference_date target horizon location target_end_date output_type + + 1 2022-10-22 wk inc flu hosp 0 US 2022-10-22 mean + 2 2022-10-22 wk inc flu hosp 0 01 2022-10-22 mean + 3 2022-10-22 wk inc flu hosp 0 02 2022-10-22 mean + 4 2022-10-22 wk inc flu hosp 0 04 2022-10-22 mean + 5 2022-10-22 wk inc flu hosp 0 05 2022-10-22 mean + 6 2022-10-22 wk inc flu hosp 1 US 2022-10-29 mean + 7 2022-10-22 wk inc flu hosp 1 01 2022-10-29 mean + 8 2022-10-22 wk inc flu hosp 1 02 2022-10-29 mean + 9 2022-10-22 wk inc flu hosp 1 04 2022-10-29 mean + 10 2022-10-22 wk inc flu hosp 1 05 2022-10-29 mean + # i 1,520 more rows + # i 2 more variables: output_type_id , value + + +--- + + Code + match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22", + output_types = "sample") + Output + [[1]] + NULL + + [[2]] + # A tibble: 1,500 x 8 + reference_date target horizon location target_end_date output_type + + 1 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 2 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 3 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 4 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 5 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 6 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 7 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 8 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 9 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + 10 2022-10-22 wk inc flu hosp 0 US 2022-10-22 sample + # i 1,490 more rows + # i 2 more variables: output_type_id , value + + diff --git a/tests/testthat/test-match_tbl_to_model_task.R b/tests/testthat/test-match_tbl_to_model_task.R new file mode 100644 index 00000000..18cfc3be --- /dev/null +++ b/tests/testthat/test-match_tbl_to_model_task.R @@ -0,0 +1,18 @@ +test_that("match_tbl_to_model_task works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + tbl <- read_model_out_file( + file_path = "flu-base/2022-10-22-flu-base.csv", + hub_path, coerce_types = "chr" + ) + config_tasks <- hubUtils::read_config(hub_path, "tasks") + + expect_snapshot( + match_tbl_to_model_task(tbl, config_tasks, round_id = "2022-10-22") + ) + expect_snapshot( + match_tbl_to_model_task(tbl, config_tasks, + round_id = "2022-10-22", + output_types = "sample" + ) + ) +}) From 2bc332c348677a490849fd9f7d17769a79aa8f6f Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 11:51:13 +0300 Subject: [PATCH 04/17] refactor check_tbl_value_col to work on smaller subsets of data --- R/check_tbl_value_col.R | 73 +++++++++++++++----------------------- man/check_tbl_value_col.Rd | 12 ++++++- 2 files changed, 40 insertions(+), 45 deletions(-) diff --git a/R/check_tbl_value_col.R b/R/check_tbl_value_col.R index 189b7f02..d60626ff 100644 --- a/R/check_tbl_value_col.R +++ b/R/check_tbl_value_col.R @@ -5,54 +5,32 @@ #' type of the appropriate model task. #' @inherit check_tbl_colnames params #' @inherit check_tbl_col_types return +#' @inheritParams expand_model_out_grid #' @export -check_tbl_value_col <- function(tbl, round_id, file_path, hub_path) { +check_tbl_value_col <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { config_tasks <- hubUtils::read_config(hub_path, "tasks") tbl[, names(tbl) != "value"] <- hubData::coerce_to_character( tbl[, names(tbl) != "value"] ) + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } - full <- expand_model_out_grid( - config_tasks, - round_id = round_id, - required_vals_only = FALSE, - all_character = TRUE, - as_arrow_table = FALSE, - bind_model_tasks = FALSE - ) - - join_cols <- names(tbl)[names(tbl) != "value"] # nolint: object_usage_linter - tbl <- purrr::map( - full, - ~ dplyr::inner_join(.x, tbl, by = join_cols) - ) - - round_config <- get_file_round_config(file_path, hub_path) - output_type_config <- round_config[["model_tasks"]] %>% - purrr::map(~ .x[["output_type"]]) - - - details <- purrr::map2( - tbl, output_type_config, - check_modeling_task_value_col - ) %>% + details <- split(tbl, f = tbl$output_type) %>% + purrr::imap( + ~ check_value_col_by_output_type( + tbl = .x, output_type = .y, + config_tasks = config_tasks, + round_id = round_id, + derived_task_ids = derived_task_ids + ) + ) %>% unlist(use.names = TRUE) check <- is.null(details) - ## Example code for attempting bullets of details. Needs more experimentation - ## but parking for now. - # if (!check) { - # details_bullets_div <- function(details) { - # cli::cli_div() - # cli::format_bullets_raw( - # stats::setNames(details, rep("*", length(details))) - # ) - # } - # details <- details_bullets_div(details) - # } - capture_check_cnd( check = check, file_path = file_path, @@ -63,19 +41,26 @@ check_tbl_value_col <- function(tbl, round_id, file_path, hub_path) { ) } - -check_modeling_task_value_col <- function(tbl, output_type_config) { - purrr::imap( - split(tbl, tbl[["output_type"]]), - ~ compare_values_to_config( - tbl = .x, output_type = .y, - output_type_config +check_value_col_by_output_type <- function(tbl, output_type, + config_tasks, round_id, + derived_task_ids = NULL) { + purrr::map2( + .x = match_tbl_to_model_task(tbl, config_tasks, + round_id, output_type, + derived_task_ids = derived_task_ids + ), + .y = get_round_output_types(config_tasks, round_id), + \(.x, .y) compare_values_to_config( + tbl = .x, output_type_config = .y, output_type = output_type ) ) %>% unlist(use.names = TRUE) } compare_values_to_config <- function(tbl, output_type, output_type_config) { + if (any(is.null(tbl), is.null(output_type_config))) { + return(NULL) + } details <- NULL values <- tbl$value config <- output_type_config[[output_type]][["value"]] diff --git a/man/check_tbl_value_col.Rd b/man/check_tbl_value_col.Rd index 36dc8cfb..b443a871 100644 --- a/man/check_tbl_value_col.Rd +++ b/man/check_tbl_value_col.Rd @@ -4,7 +4,13 @@ \alias{check_tbl_value_col} \title{Check output type values of model output data against config} \usage{ -check_tbl_value_col(tbl, round_id, file_path, hub_path) +check_tbl_value_col( + tbl, + round_id, + file_path, + hub_path, + derived_task_ids = NULL +) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated.} @@ -23,6 +29,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: From 7fe78b48d67078de2b54d4d0b3821499370e5389 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 16:52:32 +0300 Subject: [PATCH 05/17] propagate output subsetting and derived_task_ids arg to spl checks --- R/check_tbl_spl_compound_taskid_set.R | 18 ++++++---- R/check_tbl_spl_compound_tid.R | 8 +++-- R/check_tbl_spl_n.R | 7 ++-- R/check_tbl_spl_non_compound_tid.R | 6 ++-- R/compound_taskid-utils.R | 14 +++++--- R/v3-sample-utils.R | 20 ++++++----- man/check_tbl_spl_compound_taskid_set.Rd | 12 ++++++- man/check_tbl_spl_compound_tid.Rd | 7 +++- man/check_tbl_spl_n.Rd | 13 +++++++- man/check_tbl_spl_non_compound_tid.Rd | 7 +++- man/get_tbl_compound_taskid_set.Rd | 7 +++- .../check_tbl_spl_compound_taskid_set.md | 10 ++++++ .../_snaps/check_tbl_spl_compound_tid.md | 10 ++++++ tests/testthat/_snaps/check_tbl_spl_n.md | 11 ++++++- .../_snaps/check_tbl_spl_non_compound_tid.md | 10 ++++++ .../test-check_tbl_spl_compound_taskid_set.R | 16 +++++++++ .../test-check_tbl_spl_compound_tid.R | 16 +++++++++ tests/testthat/test-check_tbl_spl_n.R | 33 ++++++++++++++----- .../test-check_tbl_spl_non_compound_tid.R | 19 ++++++++++- 19 files changed, 204 insertions(+), 40 deletions(-) diff --git a/R/check_tbl_spl_compound_taskid_set.R b/R/check_tbl_spl_compound_taskid_set.R index 48155a0c..6107d0d8 100644 --- a/R/check_tbl_spl_compound_taskid_set.R +++ b/R/check_tbl_spl_compound_taskid_set.R @@ -8,6 +8,7 @@ #' Column types must **all be character**. #' @inherit check_tbl_colnames params #' @inherit check_tbl_colnames return +#' @inheritParams expand_model_out_grid #' @details If the check fails, the output of the check includes an `errors` element, #' a list of items, one for each modeling task failing validation. #' The structure depends on the reason the check failed. @@ -31,15 +32,18 @@ #' See [hubverse documentation on samples](https://hubverse.io/en/latest/user-guide/sample-output-type.html) #' for more details. #' @export -check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path) { +check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { config_tasks <- hubUtils::read_config(hub_path, "tasks") if (isFALSE(has_spls_tbl(tbl)) || isFALSE(hubUtils::is_v3_config(config_tasks))) { return(skip_v3_spl_check(file_path)) } - compound_taskid_set <- get_tbl_compound_taskid_set(tbl, config_tasks, round_id, - compact = FALSE, error = FALSE + compound_taskid_set <- get_tbl_compound_taskid_set( + tbl, config_tasks, round_id, + compact = FALSE, error = FALSE, + derived_task_ids = NULL ) check <- purrr::map_lgl( @@ -47,8 +51,6 @@ check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path ~ is.null(attr(.x, "errors")) ) |> all() - - capture_check_cnd( check = check, file_path = file_path, @@ -59,7 +61,11 @@ check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path details = compile_msg(compound_taskid_set), errors = compile_errors(compound_taskid_set), error = TRUE, - compound_taskid_set = if (check) { compound_taskid_set } else { NA } + compound_taskid_set = if (check) { + compound_taskid_set + } else { + NA + } ) } diff --git a/R/check_tbl_spl_compound_tid.R b/R/check_tbl_spl_compound_tid.R index d79d29fc..f0a3f28f 100644 --- a/R/check_tbl_spl_compound_tid.R +++ b/R/check_tbl_spl_compound_tid.R @@ -4,6 +4,7 @@ #' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**. #' @inherit check_tbl_colnames params #' @inherit check_tbl_colnames return +#' @inheritParams expand_model_out_grid #' @param compound_taskid_set a list of `compound_taskid_set`s (characters vector of compound task IDs), #' one for each modeling task. Used to override the compound task ID set in the config file, #' for example, when validating coarser samples. @@ -17,7 +18,8 @@ #' for more details. #' @export check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path, - compound_taskid_set = NULL) { + compound_taskid_set = NULL, + derived_task_ids = NULL) { if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) { cli::cli_abort("Valid {.var compound_taskid_set} must be provided.") } @@ -33,7 +35,9 @@ check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path, return(skip_v3_spl_check(file_path)) } - hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set) + hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set, + derived_task_ids = derived_task_ids + ) # TODO: Currently, samples must strictly match the compound task ID set expectations # and cannot handle coarser-grained compound task ID sets. n_tbl <- hash_tbl[hash_tbl$n_compound_idx > 1L, ] diff --git a/R/check_tbl_spl_n.R b/R/check_tbl_spl_n.R index 1a6e5b4b..ed967de5 100644 --- a/R/check_tbl_spl_n.R +++ b/R/check_tbl_spl_n.R @@ -16,7 +16,8 @@ #' for more details. #' @export check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path, - compound_taskid_set = NULL) { + compound_taskid_set = NULL, + derived_task_ids = NULL) { if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) { cli::cli_abort("Valid {.var compound_taskid_set} must be provided.") } @@ -32,7 +33,9 @@ check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path, return(skip_v3_spl_check(file_path)) } - hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set) + hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set, + derived_task_ids = derived_task_ids + ) n_ranges <- get_round_spl_n_ranges(config_tasks, round_id) n_tbl <- dplyr::group_by(hash_tbl, .data$compound_idx) %>% diff --git a/R/check_tbl_spl_non_compound_tid.R b/R/check_tbl_spl_non_compound_tid.R index fd1eb0ee..41975fdb 100644 --- a/R/check_tbl_spl_non_compound_tid.R +++ b/R/check_tbl_spl_non_compound_tid.R @@ -17,7 +17,8 @@ #' for more details. #' @export check_tbl_spl_non_compound_tid <- function(tbl, round_id, file_path, hub_path, - compound_taskid_set = NULL) { + compound_taskid_set = NULL, + derived_task_ids = NULL) { if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) { cli::cli_abort("Valid {.var compound_taskid_set} must be provided.") } @@ -33,7 +34,8 @@ check_tbl_spl_non_compound_tid <- function(tbl, round_id, file_path, hub_path, return(skip_v3_spl_check(file_path)) } - hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set) + hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set, + derived_task_ids = derived_task_ids) n_tbl <- dplyr::summarise( hash_tbl, diff --git a/R/compound_taskid-utils.R b/R/compound_taskid-utils.R index b37357ad..3f44d4a8 100644 --- a/R/compound_taskid-utils.R +++ b/R/compound_taskid-utils.R @@ -8,6 +8,7 @@ #' @param error Logical. If TRUE, an error will be thrown if the compound task ID set is not valid. #' If FALSE and an error is detected, the detected compound task ID set will be #' returned with error attributes attached. +#' @inheritParams expand_model_out_grid #' #' @return A list of vectors of compound task IDs detected in the tbl, one for each #' modeling task in the round. If `compact` is TRUE, modeling tasks returning NULL @@ -29,11 +30,14 @@ #' ) #' get_tbl_compound_taskid_set <- function(tbl, config_tasks, round_id, - compact = TRUE, error = TRUE) { + compact = TRUE, error = TRUE, + derived_task_ids = NULL) { if (!inherits(tbl, "tbl_df")) { tbl <- dplyr::as_tibble(tbl) } - + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } tbl <- tbl[tbl$output_type == "sample", names(tbl) != "value"] out_tid <- hubUtils::std_colnames["output_type_id"] @@ -48,10 +52,12 @@ get_tbl_compound_taskid_set <- function(tbl, config_tasks, round_id, round_id = round_id, all_character = TRUE, include_sample_ids = FALSE, - bind_model_tasks = FALSE + bind_model_tasks = FALSE, + output_types = "sample", + derived_task_ids = derived_task_ids ), function(.x) { - if (!has_spls_tbl(.x)) { + if (nrow(.x) == 0L) { return(NULL) } dplyr::inner_join(tbl, .x[, names(.x) != out_tid], diff --git a/R/v3-sample-utils.R b/R/v3-sample-utils.R index d5438c5f..4494168c 100644 --- a/R/v3-sample-utils.R +++ b/R/v3-sample-utils.R @@ -6,8 +6,12 @@ # compound task id sets across modeling tasks). This is achieved by using a full # join between tbl sample data and the model output data sample grid for each # modeling task. This means that only valid task id combinations are considered. -spl_hash_tbl <- function(tbl, round_id, config_tasks, compound_taskid_set = NULL) { +spl_hash_tbl <- function(tbl, round_id, config_tasks, compound_taskid_set = NULL, + derived_task_ids = NULL) { tbl <- tbl[tbl$output_type == "sample", names(tbl) != "value"] + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } mt_spl_grid <- expand_model_out_grid( config_tasks = config_tasks, @@ -15,20 +19,19 @@ spl_hash_tbl <- function(tbl, round_id, config_tasks, compound_taskid_set = NULL all_character = TRUE, include_sample_ids = TRUE, bind_model_tasks = FALSE, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + output_types = "sample", + derived_task_ids = derived_task_ids ) %>% stats::setNames(seq_along(.)) mt_tbls <- purrr::map( mt_spl_grid, function(.x) { - if (is.null(.x) || !has_spls_tbl(.x)) { + if (nrow(.x) == 0L) { NULL } else { - dplyr::filter( - .x, .data$output_type == "sample" - ) %>% - dplyr::rename(compound_idx = "output_type_id") %>% + dplyr::rename(.x, compound_idx = "output_type_id") %>% dplyr::inner_join(tbl, ., by = setdiff( names(tbl), @@ -59,12 +62,11 @@ spl_hash_tbl <- function(tbl, round_id, config_tasks, compound_taskid_set = NULL ) ) %>% purrr::compact() %>% - purrr::imap( ~ dplyr::mutate(.x, mt_id = as.integer(.y))) %>% # add mt_id + purrr::imap(~ dplyr::mutate(.x, mt_id = as.integer(.y))) %>% # add mt_id purrr::list_rbind() } get_mt_spl_hash_tbl <- function(tbl, compound_taskids, round_task_ids) { - if (is.null(tbl)) { return(NULL) } diff --git a/man/check_tbl_spl_compound_taskid_set.Rd b/man/check_tbl_spl_compound_taskid_set.Rd index 65c49b84..90a19c41 100644 --- a/man/check_tbl_spl_compound_taskid_set.Rd +++ b/man/check_tbl_spl_compound_taskid_set.Rd @@ -5,7 +5,13 @@ \title{Check model output data tbl sample compound task id sets for each modeling task match or are coarser than the expected set defined in the config.} \usage{ -check_tbl_spl_compound_taskid_set(tbl, round_id, file_path, hub_path) +check_tbl_spl_compound_taskid_set( + tbl, + round_id, + file_path, + hub_path, + derived_task_ids = NULL +) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. @@ -25,6 +31,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_spl_compound_tid.Rd b/man/check_tbl_spl_compound_tid.Rd index eb18f336..3084f95b 100644 --- a/man/check_tbl_spl_compound_tid.Rd +++ b/man/check_tbl_spl_compound_tid.Rd @@ -10,7 +10,8 @@ check_tbl_spl_compound_tid( round_id, file_path, hub_path, - compound_taskid_set = NULL + compound_taskid_set = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -34,6 +35,10 @@ files within the \code{hub-config} directory.} \item{compound_taskid_set}{a list of \code{compound_taskid_set}s (characters vector of compound task IDs), one for each modeling task. Used to override the compound task ID set in the config file, for example, when validating coarser samples.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_spl_n.Rd b/man/check_tbl_spl_n.Rd index c32afb77..8c714607 100644 --- a/man/check_tbl_spl_n.Rd +++ b/man/check_tbl_spl_n.Rd @@ -5,7 +5,14 @@ \title{Check model output data tbl samples contain the appropriate number of samples for a given compound idx.} \usage{ -check_tbl_spl_n(tbl, round_id, file_path, hub_path, compound_taskid_set = NULL) +check_tbl_spl_n( + tbl, + round_id, + file_path, + hub_path, + compound_taskid_set = NULL, + derived_task_ids = NULL +) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. Column types must \strong{all be character}.} @@ -28,6 +35,10 @@ files within the \code{hub-config} directory.} \item{compound_taskid_set}{a list of \code{compound_taskid_set}s (characters vector of compound task IDs), one for each modeling task. Used to override the compound task ID set in the config file, for example, when validating coarser samples.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/check_tbl_spl_non_compound_tid.Rd b/man/check_tbl_spl_non_compound_tid.Rd index 0b3daf91..26efa67c 100644 --- a/man/check_tbl_spl_non_compound_tid.Rd +++ b/man/check_tbl_spl_non_compound_tid.Rd @@ -10,7 +10,8 @@ check_tbl_spl_non_compound_tid( round_id, file_path, hub_path, - compound_taskid_set = NULL + compound_taskid_set = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -34,6 +35,10 @@ files within the \code{hub-config} directory.} \item{compound_taskid_set}{a list of \code{compound_taskid_set}s (characters vector of compound task IDs), one for each modeling task. Used to override the compound task ID set in the config file, for example, when validating coarser samples.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/get_tbl_compound_taskid_set.Rd b/man/get_tbl_compound_taskid_set.Rd index 98efe5ad..02ca0732 100644 --- a/man/get_tbl_compound_taskid_set.Rd +++ b/man/get_tbl_compound_taskid_set.Rd @@ -9,7 +9,8 @@ get_tbl_compound_taskid_set( config_tasks, round_id, compact = TRUE, - error = TRUE + error = TRUE, + derived_task_ids = NULL ) } \arguments{ @@ -25,6 +26,10 @@ Column types must \strong{all be character}.} \item{error}{Logical. If TRUE, an error will be thrown if the compound task ID set is not valid. If FALSE and an error is detected, the detected compound task ID set will be returned with error attributes attached.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ A list of vectors of compound task IDs detected in the tbl, one for each diff --git a/tests/testthat/_snaps/check_tbl_spl_compound_taskid_set.md b/tests/testthat/_snaps/check_tbl_spl_compound_taskid_set.md index 6949c488..2606df6d 100644 --- a/tests/testthat/_snaps/check_tbl_spl_compound_taskid_set.md +++ b/tests/testthat/_snaps/check_tbl_spl_compound_taskid_set.md @@ -268,3 +268,13 @@ $ use_cli_format : logi TRUE - attr(*, "class")= chr [1:5] "check_error" "hub_check" "rlang_error" "error" ... +# Ignoring derived_task_ids in check_tbl_spl_compound_taskid_set works + + Code + check_tbl_spl_compound_taskid_set(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date") + Output + + Message: + All samples in a model task conform to single, unique compound task ID set that matches or is coarser than the configured `compound_taksid_set`. + diff --git a/tests/testthat/_snaps/check_tbl_spl_compound_tid.md b/tests/testthat/_snaps/check_tbl_spl_compound_tid.md index 08eab7bd..097993b9 100644 --- a/tests/testthat/_snaps/check_tbl_spl_compound_tid.md +++ b/tests/testthat/_snaps/check_tbl_spl_compound_tid.md @@ -84,3 +84,13 @@ Message: Each sample compound task ID contains single, unique value. +# Ignoring derived_task_ids in check_tbl_spl_compound_tid works + + Code + check_tbl_spl_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date") + Output + + Message: + Each sample compound task ID contains single, unique value. + diff --git a/tests/testthat/_snaps/check_tbl_spl_n.md b/tests/testthat/_snaps/check_tbl_spl_n.md index e8b0be82..7d646f3c 100644 --- a/tests/testthat/_snaps/check_tbl_spl_n.md +++ b/tests/testthat/_snaps/check_tbl_spl_n.md @@ -155,7 +155,7 @@ -# Overriding compound_taskid_set in check_tbl_spl_compound_tid works +# Overriding compound_taskid_set in check_tbl_spl_n works Code str(check_tbl_spl_n(tbl_coarse, round_id, file_path, hub_path)) @@ -244,3 +244,12 @@ Warning: Number of samples per compound idx not consistent. Sample numbers supplied per compound idx vary between 9 and 10. See `errors` attribute for details. +# Ignoring derived_task_ids in check_tbl_spl_n works + + Code + check_tbl_spl_n(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date") + Output + + Message: + Required samples per compound idx task present. + diff --git a/tests/testthat/_snaps/check_tbl_spl_non_compound_tid.md b/tests/testthat/_snaps/check_tbl_spl_non_compound_tid.md index 428fe7fd..0b815291 100644 --- a/tests/testthat/_snaps/check_tbl_spl_non_compound_tid.md +++ b/tests/testthat/_snaps/check_tbl_spl_non_compound_tid.md @@ -61,3 +61,13 @@ Message: Task ID combinations of non compound task id values consistent across modeling task samples. +# Ignoring derived_task_ids in check_tbl_spl_compound_tid works + + Code + check_tbl_spl_non_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date") + Output + + Message: + Task ID combinations of non compound task id values consistent across modeling task samples. + diff --git a/tests/testthat/test-check_tbl_spl_compound_taskid_set.R b/tests/testthat/test-check_tbl_spl_compound_taskid_set.R index 784370bb..7e915175 100644 --- a/tests/testthat/test-check_tbl_spl_compound_taskid_set.R +++ b/tests/testthat/test-check_tbl_spl_compound_taskid_set.R @@ -168,3 +168,19 @@ test_that("Finer compound_taskid_sets work", { ) ) }) + +test_that("Ignoring derived_task_ids in check_tbl_spl_compound_taskid_set works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + expect_snapshot( + check_tbl_spl_compound_taskid_set(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_spl_compound_tid.R b/tests/testthat/test-check_tbl_spl_compound_tid.R index f0a720db..e6e6717d 100644 --- a/tests/testthat/test-check_tbl_spl_compound_tid.R +++ b/tests/testthat/test-check_tbl_spl_compound_tid.R @@ -71,3 +71,19 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", compound_taskid_set = compound_taskid_set) ) }) + +test_that("Ignoring derived_task_ids in check_tbl_spl_compound_tid works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + expect_snapshot( + check_tbl_spl_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_spl_n.R b/tests/testthat/test-check_tbl_spl_n.R index 8369529a..a51a40df 100644 --- a/tests/testthat/test-check_tbl_spl_n.R +++ b/tests/testthat/test-check_tbl_spl_n.R @@ -65,7 +65,7 @@ test_that("check_tbl_spl_n works", { }) -test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", { +test_that("Overriding compound_taskid_set in check_tbl_spl_n works", { hub_path <- test_path("testdata/hub-spl") file_path <- "flu-base/2022-10-22-flu-base.csv" round_id <- "2022-10-22" @@ -77,10 +77,10 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", c("reference_date", "horizon") ) tbl_coarse <- create_spl_file("2022-10-22", - compound_taskid_set = compound_taskid_set, - write = FALSE, - out_datatype = "chr", - n_samples = 1L + compound_taskid_set = compound_taskid_set, + write = FALSE, + out_datatype = "chr", + n_samples = 1L ) # Validation of coarser files should return check failure. @@ -104,9 +104,9 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", # Create 100 spls of each compound idx tbl_full <- create_spl_file("2022-10-22", - compound_taskid_set = compound_taskid_set, - write = FALSE, - out_datatype = "chr" + compound_taskid_set = compound_taskid_set, + write = FALSE, + out_datatype = "chr" ) # This succeeds! @@ -128,3 +128,20 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", ) ) }) + + +test_that("Ignoring derived_task_ids in check_tbl_spl_n works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + expect_snapshot( + check_tbl_spl_n(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) +}) diff --git a/tests/testthat/test-check_tbl_spl_non_compound_tid.R b/tests/testthat/test-check_tbl_spl_non_compound_tid.R index 2755e812..7ed285e1 100644 --- a/tests/testthat/test-check_tbl_spl_non_compound_tid.R +++ b/tests/testthat/test-check_tbl_spl_non_compound_tid.R @@ -68,6 +68,23 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", # also succeeds. expect_snapshot( check_tbl_spl_non_compound_tid(tbl_coarse, round_id, file_path, hub_path, - compound_taskid_set = compound_taskid_set) + compound_taskid_set = compound_taskid_set + ) + ) +}) + +test_that("Ignoring derived_task_ids in check_tbl_spl_compound_tid works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + expect_snapshot( + check_tbl_spl_non_compound_tid(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) ) }) From 1b003f274d4767ee6dd677e9c2037157afefc07e Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 17:21:00 +0300 Subject: [PATCH 06/17] propagate output subsetting and derived_task_ids arg to higher level validation fns --- R/validate_model_data.R | 18 +- R/validate_pr.R | 4 +- R/validate_submission.R | 7 +- man/validate_pr.Rd | 7 +- man/validate_submission.Rd | 7 +- tests/testthat/_snaps/validate_submission.md | 171 +++++++++++++++++++ tests/testthat/test-validate_submission.R | 28 +++ 7 files changed, 231 insertions(+), 11 deletions(-) diff --git a/R/validate_model_data.R b/R/validate_model_data.R index f4014149..919f53eb 100644 --- a/R/validate_model_data.R +++ b/R/validate_model_data.R @@ -159,7 +159,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl_chr, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) @@ -169,7 +170,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) @@ -194,7 +196,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl_chr, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$spl_compound_taskid_set)) { @@ -208,7 +211,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, round_id = round_id, file_path = file_path, hub_path = hub_path, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$spl_compound_tid)) { @@ -220,7 +224,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, round_id = round_id, file_path = file_path, hub_path = hub_path, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$spl_non_compound_tid)) { @@ -232,7 +237,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, round_id = round_id, file_path = file_path, hub_path = hub_path, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + derived_task_ids = derived_task_ids ), file_path ) } diff --git a/R/validate_pr.R b/R/validate_pr.R index 48e44f03..15b37c41 100644 --- a/R/validate_pr.R +++ b/R/validate_pr.R @@ -107,7 +107,8 @@ validate_pr <- function(hub_path = ".", gh_repo, pr_number, submit_window_ref_date_from = c( "file", "file_path" - )) { + ), + derived_task_ids = NULL) { file_modification_check <- rlang::arg_match(file_modification_check) output_type_id_datatype <- rlang::arg_match(output_type_id_datatype) model_output_dir <- get_hub_model_output_dir(hub_path) # nolint: object_name_linter @@ -179,6 +180,7 @@ validate_pr <- function(hub_path = ".", gh_repo, pr_number, output_type_id_datatype = output_type_id_datatype, validations_cfg_path = validations_cfg_path, skip_submit_window_check = skip_submit_window_check, + derived_task_ids = derived_task_ids, skip_check_config = TRUE ) ) %>% diff --git a/R/validate_submission.R b/R/validate_submission.R index e60a1f76..278007b2 100644 --- a/R/validate_submission.R +++ b/R/validate_submission.R @@ -6,6 +6,7 @@ #' #' @inherit validate_model_data return params #' @inheritParams hubData::create_hub_schema +#' @inheritParams expand_model_out_grid #' @param skip_submit_window_check Logical. Whether to skip the submission window check. #' @param skip_check_config Logical. Whether to skip the hub config validation check. #' check. @@ -49,7 +50,8 @@ validate_submission <- function(hub_path, file_path, round_id_col = NULL, submit_window_ref_date_from = c( "file", "file_path" - )) { + ), + derived_task_ids = NULL) { check_hub_config <- new_hub_validations() output_type_id_datatype <- rlang::arg_match(output_type_id_datatype) @@ -89,7 +91,8 @@ validate_submission <- function(hub_path, file_path, round_id_col = NULL, file_path = file_path, round_id_col = round_id_col, output_type_id_datatype = output_type_id_datatype, - validations_cfg_path = validations_cfg_path + validations_cfg_path = validations_cfg_path, + derived_task_ids = derived_task_ids ) combine(check_hub_config, checks_file, checks_data, checks_submission_time) diff --git a/man/validate_pr.Rd b/man/validate_pr.Rd index 9bc41312..3e6db5db 100644 --- a/man/validate_pr.Rd +++ b/man/validate_pr.Rd @@ -15,7 +15,8 @@ validate_pr( skip_submit_window_check = FALSE, file_modification_check = c("error", "warn", "message", "none"), allow_submit_window_mods = TRUE, - submit_window_ref_date_from = c("file", "file_path") + submit_window_ref_date_from = c("file", "file_path"), + derived_task_ids = NULL ) } \arguments{ @@ -81,6 +82,10 @@ Only applicable when a round is configured to determine the submission windows relative to the value in a date column in model output files. Not applicable when explicit submission window start and end dates are provided in the hub's config.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ An object of class \code{hub_validations}. diff --git a/man/validate_submission.Rd b/man/validate_submission.Rd index 71b403ef..cbbee86a 100644 --- a/man/validate_submission.Rd +++ b/man/validate_submission.Rd @@ -13,7 +13,8 @@ validate_submission( "logical", "Date"), skip_submit_window_check = FALSE, skip_check_config = FALSE, - submit_window_ref_date_from = c("file", "file_path") + submit_window_ref_date_from = c("file", "file_path"), + derived_task_ids = NULL ) } \arguments{ @@ -64,6 +65,10 @@ Only applicable when a round is configured to determine the submission windows relative to the value in a date column in model output files. Not applicable when explicit submission window start and end dates are provided in the hub's config.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ An object of class \code{hub_validations}. Each named element contains diff --git a/tests/testthat/_snaps/validate_submission.md b/tests/testthat/_snaps/validate_submission.md index 34331062..87cd9ca4 100644 --- a/tests/testthat/_snaps/validate_submission.md +++ b/tests/testthat/_snaps/validate_submission.md @@ -943,3 +943,174 @@ Message: Column data types match hub schema. +# Ignoring derived_task_ids in validate_submission works + + Code + str(validate_submission(hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", skip_submit_window_check = TRUE, + derived_task_ids = "target_end_date")) + Output + List of 24 + $ valid_config :List of 4 + ..$ message : chr "All hub config files are valid. \n " + ..$ where : chr "samples" + ..$ call : chr "check_config_hub_valid" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_exists :List of 4 + ..$ message : chr "File exists at path 'model-output/flu-base/2022-10-22-flu-base.csv'. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_exists" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_name :List of 4 + ..$ message : chr "File name \"2022-10-22-flu-base.csv\" is valid. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_name" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_location :List of 4 + ..$ message : chr "File directory name matches `model_id`\n metadata in file name. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_location" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ round_id_valid :List of 4 + ..$ message : chr "`round_id` is valid. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_valid_round_id" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_format :List of 4 + ..$ message : chr "File is accepted hub format. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_format" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ metadata_exists :List of 4 + ..$ message : chr "Metadata file exists at path 'model-metadata/flu-base.yml'. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_submission_metadata_file_exists" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ file_read :List of 4 + ..$ message : chr "File could be read successfully. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_file_read" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ valid_round_id_col :List of 4 + ..$ message : chr "`round_id_col` name is valid. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_valid_round_id_col" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ unique_round_id :List of 4 + ..$ message : chr "`round_id` column \"reference_date\" contains a single, unique round ID value. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_unique_round_id" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ match_round_id :List of 4 + ..$ message : chr "All `round_id_col` \"reference_date\" values match submission `round_id` from file name. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_match_round_id" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ colnames :List of 4 + ..$ message : chr "Column names are consistent with expected round task IDs and std column names. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_colnames" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ col_types :List of 4 + ..$ message : chr "Column data types match hub schema. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_col_types" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ valid_vals :List of 5 + ..$ message : chr "`tbl` contains valid values/value combinations. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ error_tbl : NULL + ..$ call : chr "check_tbl_values" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ rows_unique :List of 4 + ..$ message : chr "All combinations of task ID column/`output_type`/`output_type_id` values are unique. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_rows_unique" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ req_vals :List of 5 + ..$ message : chr "Required task ID/output type/output type ID combinations all present. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ missing : tibble [0 x 7] (S3: tbl_df/tbl/data.frame) + .. ..$ location : chr(0) + .. ..$ reference_date : chr(0) + .. ..$ horizon : chr(0) + .. ..$ target_end_date: chr(0) + .. ..$ target : chr(0) + .. ..$ output_type : chr(0) + .. ..$ output_type_id : chr(0) + ..$ call : chr "check_tbl_values_required" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ value_col_valid :List of 4 + ..$ message : chr "Values in column `value` all valid with respect to modeling task config. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_value_col" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ value_col_non_desc :List of 4 + ..$ message : chr "No quantile or cdf output types to check for non-descending values.\n Check skipped." + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : chr "check_tbl_value_col_ascending" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_info" "hub_check" "rlang_message" "message" ... + $ value_col_sum1 :List of 5 + ..$ message : chr "Values in `value` column do sum to 1 for all unique task ID value combination of pmf\n output types. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ error_tbl : NULL + ..$ call : chr "check_tbl_value_col_sum1" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ spl_compound_taskid_set:List of 6 + ..$ message : chr "All samples in a model task conform to single, unique compound task ID set that matches or is\n coarser than"| __truncated__ + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ errors : NULL + ..$ compound_taskid_set:List of 2 + .. ..$ 1: NULL + .. ..$ 2: chr [1:2] "reference_date" "location" + ..$ call : chr "check_tbl_spl_compound_taskid_set" + ..$ use_cli_format : logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ spl_compound_tid :List of 5 + ..$ message : chr "Each sample compound task ID contains single, unique value. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ errors : NULL + ..$ call : chr "check_tbl_spl_compound_tid" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ spl_non_compound_tid :List of 5 + ..$ message : chr "Task ID combinations of non compound task id values consistent across modeling task samples. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ errors : NULL + ..$ call : chr "check_tbl_spl_non_compound_tid" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ spl_n :List of 5 + ..$ message : chr "Required samples per compound idx task present. \n " + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ errors : NULL + ..$ call : chr "check_tbl_spl_n" + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + $ horizon_timediff :List of 4 + ..$ message : chr "Time differences between t0 var `reference_date` and t1 var\n `target_end_date` all match expected perio"| __truncated__ + ..$ where : chr "flu-base/2022-10-22-flu-base.csv" + ..$ call : NULL + ..$ use_cli_format: logi TRUE + ..- attr(*, "class")= chr [1:5] "check_success" "hub_check" "rlang_message" "message" ... + - attr(*, "class")= chr [1:2] "hub_validations" "list" + diff --git a/tests/testthat/test-validate_submission.R b/tests/testthat/test-validate_submission.R index 89ae1d05..93f4dc03 100644 --- a/tests/testthat/test-validate_submission.R +++ b/tests/testthat/test-validate_submission.R @@ -319,3 +319,31 @@ test_that("validate_submission handles overriding output type id data type corre )[["col_types"]] ) }) + +test_that("Ignoring derived_task_ids in validate_submission works", { + # Validation passes + expect_snapshot( + str( + validate_submission( + hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", + skip_submit_window_check = TRUE, + derived_task_ids = "target_end_date" + ) + ) + ) + # Results of validation the same + expect_equal( + validate_submission( + hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", + skip_submit_window_check = TRUE, + derived_task_ids = "target_end_date" + ), + validate_submission( + hub_path = system.file("testhubs/samples", package = "hubValidations"), + file_path = "flu-base/2022-10-22-flu-base.csv", + skip_submit_window_check = TRUE + ) + ) +}) From 0686ff445c43e183ad84f89bbc997619435ebd72 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 18:18:30 +0300 Subject: [PATCH 07/17] Fix error_tbl bug by using original tbl rowids --- R/check_tbl_values.R | 55 ++++++++++++----------- tests/testthat/_snaps/check_tbl_values.md | 30 +++++++++++++ tests/testthat/test-check_tbl_values.R | 30 +++++++++++++ 3 files changed, 90 insertions(+), 25 deletions(-) diff --git a/R/check_tbl_values.R b/R/check_tbl_values.R index 5ddb954e..6bea1ca0 100644 --- a/R/check_tbl_values.R +++ b/R/check_tbl_values.R @@ -7,18 +7,19 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path, derived_task_ids = NULL) { config_tasks <- hubUtils::read_config(hub_path, "tasks") - if (!is.null(derived_task_ids)) { - tbl[, derived_task_ids] <- NA_character_ - } valid_tbl <- tbl %>% + tibble::rowid_to_column() %>% split(f = tbl$output_type) %>% purrr::imap( - ~ check_values_by_output_type(tbl = .x, output_type = .y, - config_tasks = config_tasks, - round_id = round_id, - derived_task_ids = derived_task_ids) - ) %>% purrr::list_rbind() + ~ check_values_by_output_type( + tbl = .x, output_type = .y, + config_tasks = config_tasks, + round_id = round_id, + derived_task_ids = derived_task_ids + ) + ) %>% + purrr::list_rbind() check <- !any(is.na(valid_tbl$valid)) @@ -26,15 +27,14 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path, details <- NULL error_tbl <- NULL } else { - error_summary <- summarise_invalid_values(valid_tbl, config_tasks, round_id, - derived_task_ids) + error_summary <- summarise_invalid_values( + valid_tbl, config_tasks, round_id, + derived_task_ids + ) details <- error_summary$msg if (length(error_summary$invalid_combs_idx) == 0L) { error_tbl <- NULL } else { - if (hubUtils::is_v3_config(config_tasks)) { - tbl[["output_type_id"]] <- out_type_ids - } error_tbl <- tbl[ error_summary$invalid_combs_idx, names(tbl) != "value" @@ -58,7 +58,10 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path, } check_values_by_output_type <- function(tbl, output_type, config_tasks, round_id, - derived_task_ids = NULL) { + derived_task_ids = NULL) { + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ + } # Coerce accepted vals to character for easier comparison of # values. Tried to use arrow tbls for comparisons as more efficient when @@ -76,24 +79,24 @@ check_values_by_output_type <- function(tbl, output_type, config_tasks, round_id # This approach uses dplyr to identify tbl rows that don't have a complete match # in accepted_vals. accepted_vals$valid <- TRUE - if (hubUtils::is_v3_config(config_tasks)) { - out_type_ids <- tbl[["output_type_id"]] + if (hubUtils::is_v3_config(config_tasks) && output_type == "sample") { tbl[tbl$output_type == "sample", "output_type_id"] <- NA } - valid_tbl <- dplyr::left_join( + dplyr::left_join( tbl, accepted_vals, - by = names(tbl)[names(tbl) != "value"] + by = setdiff(names(tbl), c("value", "rowid")) ) - } summarise_invalid_values <- function(valid_tbl, config_tasks, round_id, derived_task_ids) { - cols <- names(valid_tbl)[!names(valid_tbl) %in% c("value", "valid")] + cols <- setdiff(names(valid_tbl), c("value", "valid", "rowid")) uniq_tbl <- purrr::map(valid_tbl[cols], unique) - uniq_config <- get_round_config_values(config_tasks, round_id, - derived_task_ids)[cols] + uniq_config <- get_round_config_values( + config_tasks, round_id, + derived_task_ids + )[cols] invalid_vals <- purrr::map2( uniq_tbl, uniq_config, @@ -121,13 +124,15 @@ summarise_invalid_values <- function(valid_tbl, config_tasks, round_id, unlist(use.names = FALSE) %>% unique() invalid_row_idx <- which(is.na(valid_tbl$valid)) - invalid_combs_idx <- setdiff(invalid_val_idx, invalid_row_idx) + invalid_combs_idx <- setdiff(invalid_row_idx, invalid_val_idx) if (length(invalid_combs_idx) == 0L) { invalid_combs_msg <- NULL } else { + invalid_combs_idx <- valid_tbl$rowid[invalid_combs_idx] invalid_combs_msg <- cli::format_inline( - "Additionally row{?s} {.val {invalid_combs_idx}} contain invalid - combinations of valid values. + "Additionally {cli::qty(length(invalid_combs_idx))} row{?s} + {.val {invalid_combs_idx}} {cli::qty(length(invalid_combs_idx))} + {?contains/contain} invalid combinations of valid values. See {.var error_tbl} for details." ) } diff --git a/tests/testthat/_snaps/check_tbl_values.md b/tests/testthat/_snaps/check_tbl_values.md index 9b9d3654..fb0d3e76 100644 --- a/tests/testthat/_snaps/check_tbl_values.md +++ b/tests/testthat/_snaps/check_tbl_values.md @@ -38,3 +38,33 @@ Error: ! `tbl` contains invalid values/value combinations. Column `horizon` contains invalid values "11" and "12". +# Ignoring derived_task_ids in check_tbl_values works + + Code + check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date") + Output + + Message: + `tbl` contains valid values/value combinations. + +--- + + Code + check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date") + Output + + Error: + ! `tbl` contains invalid values/value combinations. Column `horizon` contains invalid value "9". Additionally row 2 contains invalid combinations of valid values. See `error_tbl` for details. + +--- + + Code + check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = "target_end_date")$ + error_tbl + Output + # A tibble: 1 x 7 + location reference_date horizon target_end_date target output_type + + 1 US 2022-10-22 1 2022-10-29 wk inc flu hosp pmf + # i 1 more variable: output_type_id + diff --git a/tests/testthat/test-check_tbl_values.R b/tests/testthat/test-check_tbl_values.R index 35c116be..9f4c56b0 100644 --- a/tests/testthat/test-check_tbl_values.R +++ b/tests/testthat/test-check_tbl_values.R @@ -195,3 +195,33 @@ test_that("check_tbl_values works with v3 spec samples", { ) ) }) + + +test_that("Ignoring derived_task_ids in check_tbl_values works", { + hub_path <- system.file("testhubs/samples", package = "hubValidations") + file_path <- "flu-base/2022-10-22-flu-base.csv" + round_id <- "2022-10-22" + tbl <- read_model_out_file( + file_path = file_path, + hub_path = hub_path, + coerce_types = "chr" + ) + expect_snapshot( + check_tbl_values(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + + tbl[1, "horizon"] <- "9" + tbl[2, "output_type"] <- "pmf" + expect_snapshot( + check_tbl_values(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + ) + ) + expect_snapshot( + check_tbl_values(tbl, round_id, file_path, hub_path, + derived_task_ids = "target_end_date" + )$error_tbl + ) +}) From df0815e3c6ae6e45f79c0b2ff17620ec1b9da646 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 18:21:09 +0300 Subject: [PATCH 08/17] remove unused functions --- R/check_tbl_values.R | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/R/check_tbl_values.R b/R/check_tbl_values.R index 6bea1ca0..edbdcfab 100644 --- a/R/check_tbl_values.R +++ b/R/check_tbl_values.R @@ -141,40 +141,3 @@ summarise_invalid_values <- function(valid_tbl, config_tasks, round_id, invalid_combs_idx = invalid_combs_idx ) } - - -get_numeric_output_type_ids <- function(file_path, hub_path) { - get_file_round_config(file_path, hub_path)[["model_tasks"]] %>% - purrr::map(~ .x[["output_type"]]) %>% - unlist(recursive = FALSE) %>% - purrr::map(~ purrr::pluck(.x, "output_type_id")) %>% - purrr::map_lgl(~ is.numeric(unlist(.x))) %>% - purrr::keep(isTRUE) %>% - names() %>% - unique() -} - - -coerce_num_output_type_ids <- function(tbl, file_path, hub_path) { - num_output_types <- get_numeric_output_type_ids( - file_path = file_path, - hub_path = hub_path - ) - - if ( - any(tbl[["output_type"]] %in% num_output_types) && - inherits(tbl[["output_type_id"]], "character") - ) { - type_coerce <- tbl[["output_type"]] %in% num_output_types - num_output_type_id <- suppressWarnings( - as.numeric(tbl$output_type_id[type_coerce]) - ) - # establish only valid coercions to distinguish between the potential for - # two cdf output types in the same round, one numeric and one character. - valid <- !is.na(num_output_type_id) - tbl$output_type_id[type_coerce][valid] <- as.character( - num_output_type_id[valid] - ) - } - tbl -} From 4f18bca8b657c989d2e670f1f64733a5ee231587 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 18:23:51 +0300 Subject: [PATCH 09/17] Fix linter issues --- R/check_tbl_value_col.R | 22 +++--- R/expand_model_out_grid.R | 3 +- .../test-check_tbl_spl_compound_taskid_set.R | 24 +++--- .../test-check_tbl_spl_compound_tid.R | 17 ++--- tests/testthat/test-check_tbl_values.R | 6 +- tests/testthat/test-expand_model_out_grid.R | 75 +++++++++---------- 6 files changed, 75 insertions(+), 72 deletions(-) diff --git a/R/check_tbl_value_col.R b/R/check_tbl_value_col.R index d60626ff..85ce5af3 100644 --- a/R/check_tbl_value_col.R +++ b/R/check_tbl_value_col.R @@ -20,12 +20,14 @@ check_tbl_value_col <- function(tbl, round_id, file_path, hub_path, details <- split(tbl, f = tbl$output_type) %>% purrr::imap( - ~ check_value_col_by_output_type( - tbl = .x, output_type = .y, - config_tasks = config_tasks, - round_id = round_id, - derived_task_ids = derived_task_ids - ) + \(.x, .y) { + check_value_col_by_output_type( + tbl = .x, output_type = .y, + config_tasks = config_tasks, + round_id = round_id, + derived_task_ids = derived_task_ids + ) + } ) %>% unlist(use.names = TRUE) @@ -50,9 +52,11 @@ check_value_col_by_output_type <- function(tbl, output_type, derived_task_ids = derived_task_ids ), .y = get_round_output_types(config_tasks, round_id), - \(.x, .y) compare_values_to_config( - tbl = .x, output_type_config = .y, output_type = output_type - ) + \(.x, .y) { + compare_values_to_config( + tbl = .x, output_type_config = .y, output_type = output_type + ) + } ) %>% unlist(use.names = TRUE) } diff --git a/R/expand_model_out_grid.R b/R/expand_model_out_grid.R index 0b5c95ea..df9d9a56 100644 --- a/R/expand_model_out_grid.R +++ b/R/expand_model_out_grid.R @@ -571,7 +571,8 @@ validate_derived_task_ids <- function(derived_task_ids, config_tasks, round_id) purrr::map_lgl( ~ !is.null(.x$required) ) - ) %>% purrr::reduce(`|`) + ) %>% + purrr::reduce(`|`) if (any(has_required)) { cli::cli_abort( c( diff --git a/tests/testthat/test-check_tbl_spl_compound_taskid_set.R b/tests/testthat/test-check_tbl_spl_compound_taskid_set.R index 7e915175..67cf42ba 100644 --- a/tests/testthat/test-check_tbl_spl_compound_taskid_set.R +++ b/tests/testthat/test-check_tbl_spl_compound_taskid_set.R @@ -93,17 +93,17 @@ test_that("Different compound_taskid_sets work", { # Mock the config file to include all task ids a derived task id depends on # in the compound_taskid_set but exclude the derived task id itself. # Currently will fail - config_tasks_full_ctids <- purrr::modify_in( - hubUtils::read_config_file( - fs::path(hub_path, "hub-config", "tasks.json") - ), - list( - "rounds", 1, "model_tasks", 2, - "output_type", "sample", - "output_type_id_params", "compound_taskid_set" - ), - ~ c("reference_date", "horizon", "location", "variant") - ) + config_tasks_full_ctids <- purrr::modify_in( + hubUtils::read_config_file( + fs::path(hub_path, "hub-config", "tasks.json") + ), + list( + "rounds", 1, "model_tasks", 2, + "output_type", "sample", + "output_type_id_params", "compound_taskid_set" + ), + ~ c("reference_date", "horizon", "location", "variant") + ) mockery::stub( check_tbl_spl_compound_taskid_set, @@ -180,7 +180,7 @@ test_that("Ignoring derived_task_ids in check_tbl_spl_compound_taskid_set works" ) expect_snapshot( check_tbl_spl_compound_taskid_set(tbl, round_id, file_path, hub_path, - derived_task_ids = "target_end_date" + derived_task_ids = "target_end_date" ) ) }) diff --git a/tests/testthat/test-check_tbl_spl_compound_tid.R b/tests/testthat/test-check_tbl_spl_compound_tid.R index e6e6717d..7554e394 100644 --- a/tests/testthat/test-check_tbl_spl_compound_tid.R +++ b/tests/testthat/test-check_tbl_spl_compound_tid.R @@ -50,14 +50,12 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", c("reference_date", "horizon") ) tbl_coarse <- create_spl_file("2022-10-22", - compound_taskid_set = compound_taskid_set, - write = FALSE, - out_datatype = "chr", - n_samples = 1L + compound_taskid_set = compound_taskid_set, + write = FALSE, + out_datatype = "chr", + n_samples = 1L ) - - # Normal validation should return check failure expect_snapshot( str( @@ -65,10 +63,11 @@ test_that("Overriding compound_taskid_set in check_tbl_spl_compound_tid works", ) ) - # Validation providing coarser compound taskid set succeeds + # Validation providing coarser compound taskid set succeeds expect_snapshot( check_tbl_spl_compound_tid(tbl_coarse, round_id, file_path, hub_path, - compound_taskid_set = compound_taskid_set) + compound_taskid_set = compound_taskid_set + ) ) }) @@ -83,7 +82,7 @@ test_that("Ignoring derived_task_ids in check_tbl_spl_compound_tid works", { ) expect_snapshot( check_tbl_spl_compound_tid(tbl, round_id, file_path, hub_path, - derived_task_ids = "target_end_date" + derived_task_ids = "target_end_date" ) ) }) diff --git a/tests/testthat/test-check_tbl_values.R b/tests/testthat/test-check_tbl_values.R index 9f4c56b0..42537032 100644 --- a/tests/testthat/test-check_tbl_values.R +++ b/tests/testthat/test-check_tbl_values.R @@ -208,7 +208,7 @@ test_that("Ignoring derived_task_ids in check_tbl_values works", { ) expect_snapshot( check_tbl_values(tbl, round_id, file_path, hub_path, - derived_task_ids = "target_end_date" + derived_task_ids = "target_end_date" ) ) @@ -216,12 +216,12 @@ test_that("Ignoring derived_task_ids in check_tbl_values works", { tbl[2, "output_type"] <- "pmf" expect_snapshot( check_tbl_values(tbl, round_id, file_path, hub_path, - derived_task_ids = "target_end_date" + derived_task_ids = "target_end_date" ) ) expect_snapshot( check_tbl_values(tbl, round_id, file_path, hub_path, - derived_task_ids = "target_end_date" + derived_task_ids = "target_end_date" )$error_tbl ) }) diff --git a/tests/testthat/test-expand_model_out_grid.R b/tests/testthat/test-expand_model_out_grid.R index 63a6e9b2..823e987a 100644 --- a/tests/testthat/test-expand_model_out_grid.R +++ b/tests/testthat/test-expand_model_out_grid.R @@ -344,55 +344,54 @@ test_that("expand_model_out_grid output controls with samples work correctly", { test_that("expand_model_out_grid output type subsetting works", { config_tasks <- hubUtils::read_config_file( system.file("config", "tasks-comp-tid.json", - package = "hubValidations" + package = "hubValidations" ) ) expect_snapshot( expand_model_out_grid(config_tasks, - round_id = "2022-12-26", - include_sample_ids = TRUE, - bind_model_tasks = FALSE, - output_types = c("pmf", "sample"), + round_id = "2022-12-26", + include_sample_ids = TRUE, + bind_model_tasks = FALSE, + output_types = c("pmf", "sample"), ) ) expect_snapshot( expand_model_out_grid(config_tasks, - round_id = "2022-12-26", - include_sample_ids = TRUE, - bind_model_tasks = FALSE, - output_types = "sample", + round_id = "2022-12-26", + include_sample_ids = TRUE, + bind_model_tasks = FALSE, + output_types = "sample", ) ) expect_snapshot( expand_model_out_grid(config_tasks, - round_id = "2022-12-26", - include_sample_ids = TRUE, - bind_model_tasks = TRUE, - output_types = "sample", + round_id = "2022-12-26", + include_sample_ids = TRUE, + bind_model_tasks = TRUE, + output_types = "sample", ) ) expect_snapshot( expand_model_out_grid(config_tasks, - round_id = "2022-12-26", - include_sample_ids = FALSE, - bind_model_tasks = TRUE, - output_types = c("random", "sample"), + round_id = "2022-12-26", + include_sample_ids = FALSE, + bind_model_tasks = TRUE, + output_types = c("random", "sample"), ) ) expect_snapshot( expand_model_out_grid(config_tasks, - round_id = "2022-12-26", - include_sample_ids = FALSE, - bind_model_tasks = FALSE, - output_types = c("random"), + round_id = "2022-12-26", + include_sample_ids = FALSE, + bind_model_tasks = FALSE, + output_types = c("random"), ), error = TRUE ) - }) test_that("expand_model_out_grid derived_task_ids ignoring works", { @@ -400,31 +399,31 @@ test_that("expand_model_out_grid derived_task_ids ignoring works", { expect_snapshot( expand_model_out_grid(config_tasks, - round_id = "2022-10-22", - include_sample_ids = FALSE, - bind_model_tasks = TRUE, - output_types = "sample", - derived_task_ids = "target_end_date" + round_id = "2022-10-22", + include_sample_ids = FALSE, + bind_model_tasks = TRUE, + output_types = "sample", + derived_task_ids = "target_end_date" ) ) expect_snapshot( expand_model_out_grid(config_tasks, - round_id = "2022-10-22", - include_sample_ids = TRUE, - bind_model_tasks = TRUE, - output_types = "sample", - derived_task_ids = "target_end_date", - required_vals_only = TRUE + round_id = "2022-10-22", + include_sample_ids = TRUE, + bind_model_tasks = TRUE, + output_types = "sample", + derived_task_ids = "target_end_date", + required_vals_only = TRUE ) ) expect_snapshot( expand_model_out_grid(config_tasks, - round_id = "2022-10-22", - include_sample_ids = FALSE, - bind_model_tasks = FALSE, - output_types = "sample", - derived_task_ids = c("location", "variant") + round_id = "2022-10-22", + include_sample_ids = FALSE, + bind_model_tasks = FALSE, + output_types = "sample", + derived_task_ids = c("location", "variant") ), error = TRUE ) From 809a340c6c688be034e81db851a5c7831dd9c6b6 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 18:24:07 +0300 Subject: [PATCH 10/17] Bump version --- DESCRIPTION | 2 +- NEWS.md | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3d327a12..2ee2c5c7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: hubValidations Title: Testing framework for hubverse hub validations -Version: 0.4.0 +Version: 0.5.0 Authors@R: c( person( given = "Anna", diff --git a/NEWS.md b/NEWS.md index 5449c894..f32b8d63 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,13 @@ +# hubValidations 0.5.0 + +This release introduces significant improvements in the performance of submission validation via the following changes: +* Add ability to sub-set expanded valid value grids by output type through `output_type` argument in `expand_model_out_grid()` (#98). +* Add ability to ignore the values of derived task IDsin expanded valid value grids through argument `derived_task_ids` in `expand_model_out_grid()`. +Both of these changes allow for the creation of smaller, more focused valid value grids that can be used to validate submissions more efficiently. + +Additional useful functionality: +* Add new exported function `match_tbl_to_model_task()` that matches the rows in a `tbl` of model output data to a model task of a given round (as defined in `tasks.json`). + # hubValidations 0.4.0 - Add new `check_tbl_spl_compound_taskid_set()` check function to `validate_model_data()` that ensures that sample compound task id sets for each modeling task match or are coarser than the expected set defined in `tasks.json` config. From 451b7a24fdd107d20b61ec2aaa5c6e03e24e7952 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Wed, 14 Aug 2024 18:31:56 +0300 Subject: [PATCH 11/17] Add more detail to NEWS --- NEWS.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index f32b8d63..0b0a2c34 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,8 +2,10 @@ This release introduces significant improvements in the performance of submission validation via the following changes: * Add ability to sub-set expanded valid value grids by output type through `output_type` argument in `expand_model_out_grid()` (#98). -* Add ability to ignore the values of derived task IDsin expanded valid value grids through argument `derived_task_ids` in `expand_model_out_grid()`. -Both of these changes allow for the creation of smaller, more focused valid value grids that can be used to validate submissions more efficiently. +* Add ability to ignore the values of derived task IDs in expanded valid value grids through argument `derived_task_ids` in `expand_model_out_grid()`. +* Use sub-setting and batching of model output data validation by output type in appropriate lower level checks and add ability to ignore derived task IDs in `validate_model_data()`, `validate_submission()` and `validate_pr()`. + +Both of these changes allow for the creation of smaller, more focused expanded valid value grids, significantly reducing pressure on memory when working with large, complex hub configs and making submission validation much more efficient. Additional useful functionality: * Add new exported function `match_tbl_to_model_task()` that matches the rows in a `tbl` of model output data to a model task of a given round (as defined in `tasks.json`). From edbf9392c41e4c71e24a66f718d1f585fb10aac0 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Thu, 15 Aug 2024 10:26:28 +0300 Subject: [PATCH 12/17] Throw error if invalid output_type supplied instead of ignoring. --- R/expand_model_out_grid.R | 8 ++++---- tests/testthat/_snaps/expand_model_out_grid.md | 14 ++++---------- tests/testthat/test-expand_model_out_grid.R | 3 ++- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/R/expand_model_out_grid.R b/R/expand_model_out_grid.R index 469c3cdf..6a42ba6e 100644 --- a/R/expand_model_out_grid.R +++ b/R/expand_model_out_grid.R @@ -535,17 +535,17 @@ validate_output_types <- function(output_types, config_tasks, round_id, return(NULL) } round_output_types <- get_round_output_type_names(config_tasks, round_id) - valid_output_types <- intersect(output_types, round_output_types) - if (length(valid_output_types) == 0L) { + invalid_output_types <- setdiff(output_types, round_output_types) + if (length(invalid_output_types) > 0L) { cli::cli_abort( c( - "x" = "{.val {output_types}} {?is/are} not valid output type{?s}.", + "x" = "{.val {invalid_output_types}} {?is/are} not valid output type{?s}.", "i" = "{.arg output_types} must be members of: {.val {round_output_types}}" ), call = call ) } - valid_output_types + output_types } validate_derived_task_ids <- function(derived_task_ids, config_tasks, round_id) { diff --git a/tests/testthat/_snaps/expand_model_out_grid.md b/tests/testthat/_snaps/expand_model_out_grid.md index fe208dda..c1a91de2 100644 --- a/tests/testthat/_snaps/expand_model_out_grid.md +++ b/tests/testthat/_snaps/expand_model_out_grid.md @@ -563,16 +563,10 @@ expand_model_out_grid(config_tasks, round_id = "2022-12-26", include_sample_ids = FALSE, bind_model_tasks = TRUE, output_types = c( "random", "sample"), ) - Output - # A tibble: 6 x 6 - forecast_date target horizon location output_type output_type_id - - 1 2022-12-26 wk ahead inc flu ho~ 2 US sample - 2 2022-12-26 wk ahead inc flu ho~ 1 US sample - 3 2022-12-26 wk ahead inc flu ho~ 2 01 sample - 4 2022-12-26 wk ahead inc flu ho~ 1 01 sample - 5 2022-12-26 wk ahead inc flu ho~ 2 02 sample - 6 2022-12-26 wk ahead inc flu ho~ 1 02 sample + Condition + Error in `expand_model_out_grid()`: + x "random" is not valid output type. + i `output_types` must be members of: "sample", "mean", and "pmf" --- diff --git a/tests/testthat/test-expand_model_out_grid.R b/tests/testthat/test-expand_model_out_grid.R index c3e6537e..f94f4230 100644 --- a/tests/testthat/test-expand_model_out_grid.R +++ b/tests/testthat/test-expand_model_out_grid.R @@ -381,7 +381,8 @@ test_that("expand_model_out_grid output type subsetting works", { include_sample_ids = FALSE, bind_model_tasks = TRUE, output_types = c("random", "sample"), - ) + ), + error = TRUE ) # If no valid output type provided, errors From 28715b39fa0e95c42283fea241affa56511286c9 Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Thu, 15 Aug 2024 10:58:39 +0300 Subject: [PATCH 13/17] add output_types and derived_task_ids arguments to submission_tmpl --- R/submission_tmpl.R | 23 ++++++++- man/submission_tmpl.Rd | 26 +++++++++- tests/testthat/_snaps/submission_tmpl.md | 61 ++++++++++++++++++++++++ tests/testthat/test-submission_tmpl.R | 38 +++++++++++++++ 4 files changed, 145 insertions(+), 3 deletions(-) diff --git a/R/submission_tmpl.R b/R/submission_tmpl.R index a84fde80..cad9af36 100644 --- a/R/submission_tmpl.R +++ b/R/submission_tmpl.R @@ -92,10 +92,27 @@ #' NULL #' ) #' ) +#' # Subsetting for a single output type +#' submission_tmpl( +#' config_tasks = config_tasks, +#' round_id = "2022-12-26", +#' output_types = "sample" +#' ) +#' # Derive a template for a derived task ID +#' config_tasks <- hubUtils::read_config(test_path("testdata", "hub-spl")) +#' submission_tmpl( +#' config_tasks = config_tasks, +#' round_id = "2022-10-22", +#' output_types = "sample", +#' derived_task_ids = "target_end_date", +#' complete_cases_only = FALSE +#' ) submission_tmpl <- function(hub_con, config_tasks, round_id, required_vals_only = FALSE, complete_cases_only = TRUE, - compound_taskid_set = NULL) { + compound_taskid_set = NULL, + output_types = NULL, + derived_task_ids = NULL) { switch(rlang::check_exclusive(hub_con, config_tasks), hub_con = { checkmate::assert_class(hub_con, classes = "hub_connection") @@ -108,7 +125,9 @@ submission_tmpl <- function(hub_con, config_tasks, round_id, round_id = round_id, required_vals_only = required_vals_only, include_sample_ids = TRUE, - compound_taskid_set = compound_taskid_set + compound_taskid_set = compound_taskid_set, + output_types = output_types, + derived_task_ids = derived_task_ids ) tmpl_cols <- c( diff --git a/man/submission_tmpl.Rd b/man/submission_tmpl.Rd index dfc6ced6..1627ae2e 100644 --- a/man/submission_tmpl.Rd +++ b/man/submission_tmpl.Rd @@ -10,7 +10,9 @@ submission_tmpl( round_id, required_vals_only = FALSE, complete_cases_only = TRUE, - compound_taskid_set = NULL + compound_taskid_set = NULL, + output_types = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -38,6 +40,13 @@ are included in the output.} in the round. Can be used to override the compound task ID set defined in the config. If \code{NULL} is provided for a given modeling task, a compound task ID set of all task IDs is used.} + +\item{output_types}{Character vector of output type names to include. +Use to subset for grids for specific output types.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ a tibble template containing an expanded grid of valid task ID and @@ -127,4 +136,19 @@ submission_tmpl( NULL ) ) +# Subsetting for a single output type +submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-12-26", + output_types = "sample" +) +# Derive a template for a derived task ID +config_tasks <- hubUtils::read_config(test_path("testdata", "hub-spl")) +submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-10-22", + output_types = "sample", + derived_task_ids = "target_end_date", + complete_cases_only = FALSE +) } diff --git a/tests/testthat/_snaps/submission_tmpl.md b/tests/testthat/_snaps/submission_tmpl.md index c3939889..247f27b0 100644 --- a/tests/testthat/_snaps/submission_tmpl.md +++ b/tests/testthat/_snaps/submission_tmpl.md @@ -290,3 +290,64 @@ x "random_var" is not valid task ID. i The `compound_taskid_set` must be a subset of "forecast_date", "target", "horizon", and "location". +# submission_tmpl output type subsetting works + + Code + submission_tmpl(config_tasks = config_tasks, round_id = "2022-12-26", + output_types = "sample") + Output + # A tibble: 6 x 7 + forecast_date target horizon location output_type output_type_id value + + 1 2022-12-26 wk ahead inc ~ 2 US sample 1 NA + 2 2022-12-26 wk ahead inc ~ 2 01 sample 1 NA + 3 2022-12-26 wk ahead inc ~ 2 02 sample 1 NA + 4 2022-12-26 wk ahead inc ~ 1 US sample 2 NA + 5 2022-12-26 wk ahead inc ~ 1 01 sample 2 NA + 6 2022-12-26 wk ahead inc ~ 1 02 sample 2 NA + +--- + + Code + submission_tmpl(config_tasks = config_tasks, round_id = "2022-12-26", + output_types = c("mean", "sample")) + Output + # A tibble: 12 x 7 + forecast_date target horizon location output_type output_type_id value + + 1 2022-12-26 wk ahead inc~ 2 US mean NA + 2 2022-12-26 wk ahead inc~ 1 US mean NA + 3 2022-12-26 wk ahead inc~ 2 01 mean NA + 4 2022-12-26 wk ahead inc~ 1 01 mean NA + 5 2022-12-26 wk ahead inc~ 2 02 mean NA + 6 2022-12-26 wk ahead inc~ 1 02 mean NA + 7 2022-12-26 wk ahead inc~ 2 US sample 1 NA + 8 2022-12-26 wk ahead inc~ 2 01 sample 1 NA + 9 2022-12-26 wk ahead inc~ 2 02 sample 1 NA + 10 2022-12-26 wk ahead inc~ 1 US sample 2 NA + 11 2022-12-26 wk ahead inc~ 1 01 sample 2 NA + 12 2022-12-26 wk ahead inc~ 1 02 sample 2 NA + +# submission_tmpl ignoring derived task ids works + + Code + submission_tmpl(config_tasks = config_tasks, round_id = "2022-10-22", + output_types = "sample", derived_task_ids = "target_end_date", + complete_cases_only = FALSE) + Output + # A tibble: 80 x 9 + reference_date target horizon location variant target_end_date output_type + + 1 2022-10-22 wk inc f~ 0 US AA NA sample + 2 2022-10-22 wk inc f~ 1 US AA NA sample + 3 2022-10-22 wk inc f~ 2 US AA NA sample + 4 2022-10-22 wk inc f~ 3 US AA NA sample + 5 2022-10-22 wk inc f~ 0 01 AA NA sample + 6 2022-10-22 wk inc f~ 1 01 AA NA sample + 7 2022-10-22 wk inc f~ 2 01 AA NA sample + 8 2022-10-22 wk inc f~ 3 01 AA NA sample + 9 2022-10-22 wk inc f~ 0 02 AA NA sample + 10 2022-10-22 wk inc f~ 1 02 AA NA sample + # i 70 more rows + # i 2 more variables: output_type_id , value + diff --git a/tests/testthat/test-submission_tmpl.R b/tests/testthat/test-submission_tmpl.R index 8d1b8e05..6e9aae72 100644 --- a/tests/testthat/test-submission_tmpl.R +++ b/tests/testthat/test-submission_tmpl.R @@ -217,3 +217,41 @@ test_that("submission_tmpl errors correctly", { error = TRUE ) }) + +test_that("submission_tmpl output type subsetting works", { + config_tasks <- hubUtils::read_config_file(system.file("config", "tasks-comp-tid.json", + package = "hubValidations" + )) + + # Subsetting for a single output type + expect_snapshot( + submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-12-26", + output_types = "sample" + ) + ) + + # Subsetting for a two output types + expect_snapshot( + submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-12-26", + output_types = c("mean", "sample") + ) + ) +}) + +test_that("submission_tmpl ignoring derived task ids works", { + config_tasks <- hubUtils::read_config(test_path("testdata", "hub-spl")) + + expect_snapshot( + submission_tmpl( + config_tasks = config_tasks, + round_id = "2022-10-22", + output_types = "sample", + derived_task_ids = "target_end_date", + complete_cases_only = FALSE + ) + ) +}) From 8ec826172e19d4953c25f01583a311bc5157a99a Mon Sep 17 00:00:00 2001 From: Anna Krystalli Date: Thu, 15 Aug 2024 12:04:49 +0300 Subject: [PATCH 14/17] Add note on derived task IDs in pkgdown docs --- _pkgdown.yml | 1 + vignettes/articles/validate-pr.Rmd | 24 ++++++++++++++++++++++ vignettes/articles/validate-submission.Rmd | 13 ++++++++++++ 3 files changed, 38 insertions(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 61a7cf7f..18b36391 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -37,6 +37,7 @@ reference: - try_check - expand_model_out_grid - get_tbl_compound_taskid_set + - match_tbl_to_model_task - title: "`` methods" diff --git a/vignettes/articles/validate-pr.Rmd b/vignettes/articles/validate-pr.Rmd index 2b178c24..9571c1af 100644 --- a/vignettes/articles/validate-pr.Rmd +++ b/vignettes/articles/validate-pr.Rmd @@ -56,6 +56,29 @@ Here's an example of what the workflow looks like on GitHub: knitr::include_graphics("assets/ga-action-screenshot.png") ``` +### Ignoring derived task IDs to improve performance + +Argument **`derived_task_ids`** allows for the specification of **task IDs that are derived from other task IDs** to be ignored, which **can often lead to a significant improvement in validation performance**. + +#### What are derived task IDs? + +Derived task IDs are a class of task ID whose values depend on the values of other task IDs. As such, the **validity of derived task ID values is dependent on the values of the task IDs they are derived from** and the validity of value combinations of derived and other task IDs is much more restricted. A common example of a derived task ID is `target_end_date` which is most often derived from the `reference_date` or `origin_date` and `horizon` task ids. + +#### Implications of derived task IDs on validation performance + +With standard validation, derived task IDs like `target_end_date` tend to pollute the expanded grid used to validate valid value combination with invalid combinations. That's because, while a given combination of `reference_date` and `horizon` values will only have a single valid `target_end_date` value, the `target_end_date` property in the config will contain all possible valid values for all combinations of `reference_date` and `horizon`. Because its the values in the config used to create the expanded valid values grid, the sizes these grids can reach as function of config complexity can often put excessive strain on the system's memory, affecting overall performance of PR validation. + +#### How to ignore derived task IDs + +Supplying the names of derived task IDs to argument `derived_task_ids` will ignore them during validation checks and, depending on config complexity, this **can lead to a significant improvement in validation performance**. + + ### Skipping submission window checks @@ -109,6 +132,7 @@ These settings can be modified if required though the use of arguments `file_mod Is ignored when checking model metadata files as well as when `file_modification_check` is set to `"none"`. +