diff --git a/R/check_tbl_values.R b/R/check_tbl_values.R index 89a75de2..5ddb954e 100644 --- a/R/check_tbl_values.R +++ b/R/check_tbl_values.R @@ -1,34 +1,24 @@ #' Check model output data tbl contains valid value combinations #' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**. #' @inherit check_tbl_colnames params +#' @inheritParams expand_model_out_grid #' @inherit check_tbl_colnames return #' @export -check_tbl_values <- function(tbl, round_id, file_path, hub_path) { +check_tbl_values <- function(tbl, round_id, file_path, hub_path, + derived_task_ids = NULL) { config_tasks <- hubUtils::read_config(hub_path, "tasks") - - # Coerce accepted vals to character for easier comparison of - # values. Tried to use arrow tbls for comparisons as more efficient when - # working with larger files but currently arrow does not match NAs as dplyr - # does, returning false positives for mean & median rows which contain NA in - # output type ID column. - accepted_vals <- expand_model_out_grid( - config_tasks = config_tasks, - round_id = round_id, - all_character = TRUE - ) - - # This approach uses dplyr to identify tbl rows that don't have a complete match - # in accepted_vals. - accepted_vals$valid <- TRUE - if (hubUtils::is_v3_config(config_tasks)) { - out_type_ids <- tbl[["output_type_id"]] - tbl[tbl$output_type == "sample", "output_type_id"] <- NA + if (!is.null(derived_task_ids)) { + tbl[, derived_task_ids] <- NA_character_ } - valid_tbl <- dplyr::left_join( - tbl, accepted_vals, - by = names(tbl)[names(tbl) != "value"] - ) + valid_tbl <- tbl %>% + split(f = tbl$output_type) %>% + purrr::imap( + ~ check_values_by_output_type(tbl = .x, output_type = .y, + config_tasks = config_tasks, + round_id = round_id, + derived_task_ids = derived_task_ids) + ) %>% purrr::list_rbind() check <- !any(is.na(valid_tbl$valid)) @@ -36,7 +26,8 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) { details <- NULL error_tbl <- NULL } else { - error_summary <- summarise_invalid_values(valid_tbl, accepted_vals) + error_summary <- summarise_invalid_values(valid_tbl, config_tasks, round_id, + derived_task_ids) details <- error_summary$msg if (length(error_summary$invalid_combs_idx) == 0L) { error_tbl <- NULL @@ -66,10 +57,43 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) { ) } -summarise_invalid_values <- function(valid_tbl, accepted_vals) { +check_values_by_output_type <- function(tbl, output_type, config_tasks, round_id, + derived_task_ids = NULL) { + + # Coerce accepted vals to character for easier comparison of + # values. Tried to use arrow tbls for comparisons as more efficient when + # working with larger files but currently arrow does not match NAs as dplyr + # does, returning false positives for mean & median rows which contain NA in + # output type ID column. + accepted_vals <- expand_model_out_grid( + config_tasks = config_tasks, + round_id = round_id, + all_character = TRUE, + output_types = output_type, + derived_task_ids = derived_task_ids + ) + + # This approach uses dplyr to identify tbl rows that don't have a complete match + # in accepted_vals. + accepted_vals$valid <- TRUE + if (hubUtils::is_v3_config(config_tasks)) { + out_type_ids <- tbl[["output_type_id"]] + tbl[tbl$output_type == "sample", "output_type_id"] <- NA + } + + valid_tbl <- dplyr::left_join( + tbl, accepted_vals, + by = names(tbl)[names(tbl) != "value"] + ) + +} + +summarise_invalid_values <- function(valid_tbl, config_tasks, round_id, + derived_task_ids) { cols <- names(valid_tbl)[!names(valid_tbl) %in% c("value", "valid")] uniq_tbl <- purrr::map(valid_tbl[cols], unique) - uniq_config <- purrr::map(accepted_vals[cols], unique) + uniq_config <- get_round_config_values(config_tasks, round_id, + derived_task_ids)[cols] invalid_vals <- purrr::map2( uniq_tbl, uniq_config, diff --git a/R/config_tasks-utils.R b/R/config_tasks-utils.R index 1c6e1bc6..2266f529 100644 --- a/R/config_tasks-utils.R +++ b/R/config_tasks-utils.R @@ -27,3 +27,64 @@ get_round_output_type_names <- function(config_tasks, round_id, out } } + +# get all task_ids values +get_round_config_values <- function(config_tasks, round_id, + derived_task_ids = NULL) { + model_tasks <- hubUtils::get_round_model_tasks(config_tasks, round_id) + task_id_names <- setdiff( + hubUtils::get_round_task_id_names(config_tasks, round_id), + derived_task_ids + ) + task_id_values <- purrr::map( + purrr::set_names(task_id_names), + \(.x) get_task_id_values(.x, model_tasks) + ) + if (!is.null(derived_task_ids)) { + task_id_values <- c( + task_id_values, + purrr::map( + purrr::set_names(derived_task_ids), + ~NA_character_ + ) + ) + } + + output_type_names <- get_round_output_type_names(config_tasks, round_id) + output_type_id_values <- purrr::map( + output_type_names, + \(.x) get_output_type_id_values(.x, model_tasks) + ) %>% purrr::flatten_chr() + + output_types <- list( + output_type = output_type_names, + output_type_id = output_type_id_values + ) + + c(task_id_values, output_types) +} +get_task_id_values <- function(task_id, model_tasks) { + purrr::map( + model_tasks, + ~ .x[["task_ids"]][[task_id]] + ) %>% + unlist(use.names = FALSE) %>% + unique() %>% + as.character() +} + +get_output_type_id_values <- function(output_type, model_tasks) { + out <- purrr::map( + model_tasks, + ~ .x[["output_type"]][[output_type]][["output_type_id"]] + ) %>% + unlist(use.names = FALSE) %>% + unique() %>% + as.character() + + if (length(out) == 0L) { + return(NA_character_) + } else { + return(out) + } +} diff --git a/R/validate_model_data.R b/R/validate_model_data.R index 5cd7a03b..f4014149 100644 --- a/R/validate_model_data.R +++ b/R/validate_model_data.R @@ -3,6 +3,7 @@ #' @inheritParams check_tbl_unique_round_id #' @inheritParams validate_model_file #' @inheritParams hubData::create_hub_schema +#' @inheritParams expand_model_out_grid #' @inherit validate_model_file return #' @export #' @details @@ -29,7 +30,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, "double", "integer", "logical", "Date" ), - validations_cfg_path = NULL) { + validations_cfg_path = NULL, + derived_task_ids = NULL) { checks <- new_hub_validations() file_meta <- parse_file_name(file_path) @@ -136,7 +138,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL, tbl_chr, round_id = round_id, file_path = file_path, - hub_path = hub_path + hub_path = hub_path, + derived_task_ids = derived_task_ids ), file_path ) if (is_any_error(checks$valid_vals)) { diff --git a/man/check_tbl_values.Rd b/man/check_tbl_values.Rd index 19229251..386bb55c 100644 --- a/man/check_tbl_values.Rd +++ b/man/check_tbl_values.Rd @@ -4,7 +4,7 @@ \alias{check_tbl_values} \title{Check model output data tbl contains valid value combinations} \usage{ -check_tbl_values(tbl, round_id, file_path, hub_path) +check_tbl_values(tbl, round_id, file_path, hub_path, derived_task_ids = NULL) } \arguments{ \item{tbl}{a tibble/data.frame of the contents of the file being validated. Column types must \strong{all be character}.} @@ -23,6 +23,10 @@ For more details consult the in the \code{arrow} package. The hub must be fully configured with valid \code{admin.json} and \code{tasks.json} files within the \code{hub-config} directory.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ Depending on whether validation has succeeded, one of: diff --git a/man/validate_model_data.Rd b/man/validate_model_data.Rd index a06355cc..4ffa95e1 100644 --- a/man/validate_model_data.Rd +++ b/man/validate_model_data.Rd @@ -10,7 +10,8 @@ validate_model_data( round_id_col = NULL, output_type_id_datatype = c("from_config", "auto", "character", "double", "integer", "logical", "Date"), - validations_cfg_path = NULL + validations_cfg_path = NULL, + derived_task_ids = NULL ) } \arguments{ @@ -47,6 +48,10 @@ behaviour so use with care.} \item{validations_cfg_path}{Path to \code{validations.yml} file. If \code{NULL} defaults to \code{hub-config/validations.yml}.} + +\item{derived_task_ids}{Character vector of derived task ID names (task IDs whose +values depend on other task IDs) to ignore. Columns for such task ids will +contain \code{NA}s.} } \value{ An object of class \code{hub_validations}. Each named element contains