hubverse-org · annakrystalli · Aug 16, 2024 · Aug 9, 2024 · Aug 12, 2024 · Aug 14, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: hubValidations
 Title: Testing framework for hubverse hub validations
-Version: 0.4.0
+Version: 0.5.0
 Authors@R: c(
     person(
         given = "Anna", 

diff --git a/NAMESPACE b/NAMESPACE
@@ -50,6 +50,7 @@ export(is_exec_warn)
 export(is_failure)
 export(is_info)
 export(is_success)
+export(match_tbl_to_model_task)
 export(new_hub_validations)
 export(not_pass)
 export(opt_check_metadata_team_max_model_n)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,15 @@
+# hubValidations 0.5.0
+
+This release introduces significant improvements in the performance of submission validation via the following changes:
+* Add ability to sub-set expanded valid value grids by output type through `output_type` argument in `expand_model_out_grid()` (#98).
+* Add ability to ignore the values of derived task IDs in expanded valid value grids through argument `derived_task_ids` in `expand_model_out_grid()`. 
+* Use sub-setting and batching of model output data validation by output type in appropriate lower level checks and add ability to ignore derived task IDs in `validate_model_data()`, `validate_submission()` and `validate_pr()`. 
+
+Both of these changes allow for the creation of smaller, more focused expanded valid value grids, significantly reducing pressure on memory when working with large, complex hub configs and making submission validation much more efficient.
+
+Additional useful functionality:
+* Add new exported function `match_tbl_to_model_task()` that matches the rows in a `tbl` of model output data to a model task of a given round (as defined in `tasks.json`).
+
 # hubValidations 0.4.0
 
 - Add new `check_tbl_spl_compound_taskid_set()` check function to `validate_model_data()` that ensures that sample compound task id sets for each modeling task match or are coarser than the expected set defined in `tasks.json` config.

diff --git a/R/check_tbl_spl_compound_taskid_set.R b/R/check_tbl_spl_compound_taskid_set.R
@@ -8,6 +8,7 @@
 #' Column types must **all be character**.
 #' @inherit check_tbl_colnames params
 #' @inherit check_tbl_colnames return
+#' @inheritParams expand_model_out_grid
 #' @details If the check fails, the output of the check includes an `errors` element,
 #' a list of items, one for each modeling task failing validation.
 #' The structure depends on the reason the check failed.
@@ -31,24 +32,25 @@
 #' See [hubverse documentation on samples](https://hubverse.io/en/latest/user-guide/sample-output-type.html)
 #' for more details.
 #' @export
-check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path) {
+check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path,
+                                              derived_task_ids = NULL) {
   config_tasks <- hubUtils::read_config(hub_path, "tasks")
 
   if (isFALSE(has_spls_tbl(tbl)) || isFALSE(hubUtils::is_v3_config(config_tasks))) {
     return(skip_v3_spl_check(file_path))
   }
 
-  compound_taskid_set <- get_tbl_compound_taskid_set(tbl, config_tasks, round_id,
-    compact = FALSE, error = FALSE
+  compound_taskid_set <- get_tbl_compound_taskid_set(
+    tbl, config_tasks, round_id,
+    compact = FALSE, error = FALSE,
+    derived_task_ids = NULL
   )
 
   check <- purrr::map_lgl(
     compound_taskid_set,
     ~ is.null(attr(.x, "errors"))
   ) |> all()
 
-
-
   capture_check_cnd(
     check = check,
     file_path = file_path,

diff --git a/R/check_tbl_spl_compound_tid.R b/R/check_tbl_spl_compound_tid.R
@@ -4,6 +4,7 @@
 #' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**.
 #' @inherit check_tbl_colnames params
 #' @inherit check_tbl_colnames return
+#' @inheritParams expand_model_out_grid
 #' @param compound_taskid_set a list of `compound_taskid_set`s (characters vector of compound task IDs),
 #' one for each modeling task. Used to override the compound task ID set in the config file,
 #' for example, when validating coarser samples.
@@ -17,7 +18,8 @@
 #' for more details.
 #' @export
 check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path,
-                                       compound_taskid_set = NULL) {
+                                       compound_taskid_set = NULL,
+                                       derived_task_ids = NULL) {
   if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) {
     cli::cli_abort("Valid {.var compound_taskid_set} must be provided.")
   }
@@ -33,7 +35,9 @@ check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path,
     return(skip_v3_spl_check(file_path))
   }
 
-  hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set)
+  hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set,
+    derived_task_ids = derived_task_ids
+  )
   n_tbl <- hash_tbl[hash_tbl$n_compound_idx > 1L, ]
 
   check <- nrow(n_tbl) == 0L

diff --git a/R/check_tbl_spl_n.R b/R/check_tbl_spl_n.R
@@ -16,7 +16,8 @@
 #' for more details.
 #' @export
 check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path,
-                            compound_taskid_set = NULL) {
+                            compound_taskid_set = NULL,
+                            derived_task_ids = NULL) {
   if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) {
     cli::cli_abort("Valid {.var compound_taskid_set} must be provided.")
   }
@@ -32,7 +33,9 @@ check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path,
     return(skip_v3_spl_check(file_path))
   }
 
-  hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set)
+  hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set,
+    derived_task_ids = derived_task_ids
+  )
   n_ranges <- get_round_spl_n_ranges(config_tasks, round_id)
 
   n_tbl <- dplyr::group_by(hash_tbl, .data$compound_idx) %>%

diff --git a/R/check_tbl_spl_non_compound_tid.R b/R/check_tbl_spl_non_compound_tid.R
@@ -17,7 +17,8 @@
 #' for more details.
 #' @export
 check_tbl_spl_non_compound_tid <- function(tbl, round_id, file_path, hub_path,
-                                           compound_taskid_set = NULL) {
+                                           compound_taskid_set = NULL,
+                                           derived_task_ids = NULL) {
   if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) {
     cli::cli_abort("Valid {.var compound_taskid_set} must be provided.")
   }
@@ -33,7 +34,8 @@ check_tbl_spl_non_compound_tid <- function(tbl, round_id, file_path, hub_path,
     return(skip_v3_spl_check(file_path))
   }
 
-  hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set)
+  hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set,
+                           derived_task_ids = derived_task_ids)
 
   n_tbl <- dplyr::summarise(
     hash_tbl,

diff --git a/R/check_tbl_value_col.R b/R/check_tbl_value_col.R
@@ -5,54 +5,34 @@
 #' type of the appropriate model task.
 #' @inherit check_tbl_colnames params
 #' @inherit check_tbl_col_types return
+#' @inheritParams expand_model_out_grid
 #' @export
-check_tbl_value_col <- function(tbl, round_id, file_path, hub_path) {
+check_tbl_value_col <- function(tbl, round_id, file_path, hub_path,
+                                derived_task_ids = NULL) {
   config_tasks <- hubUtils::read_config(hub_path, "tasks")
 
   tbl[, names(tbl) != "value"] <- hubData::coerce_to_character(
     tbl[, names(tbl) != "value"]
   )
+  if (!is.null(derived_task_ids)) {
+    tbl[, derived_task_ids] <- NA_character_
+  }
 
-  full <- expand_model_out_grid(
-    config_tasks,
-    round_id = round_id,
-    required_vals_only = FALSE,
-    all_character = TRUE,
-    as_arrow_table = FALSE,
-    bind_model_tasks = FALSE
-  )
-
-  join_cols <- names(tbl)[names(tbl) != "value"] # nolint: object_usage_linter
-  tbl <- purrr::map(
-    full,
-    ~ dplyr::inner_join(.x, tbl, by = join_cols)
-  )
-
-  round_config <- get_file_round_config(file_path, hub_path)
-  output_type_config <- round_config[["model_tasks"]] %>%
-    purrr::map(~ .x[["output_type"]])
-
-
-  details <- purrr::map2(
-    tbl, output_type_config,
-    check_modeling_task_value_col
-  ) %>%
+  details <- split(tbl, f = tbl$output_type) %>%
+    purrr::imap(
+      \(.x, .y) {
+        check_value_col_by_output_type(
+          tbl = .x, output_type = .y,
+          config_tasks = config_tasks,
+          round_id = round_id,
+          derived_task_ids = derived_task_ids
+        )
+      }
+    ) %>%
     unlist(use.names = TRUE)
 
   check <- is.null(details)
 
-  ## Example code for attempting bullets of details. Needs more experimentation
-  ## but parking for now.
-  # if (!check) {
-  #     details_bullets_div <- function(details) {
-  #         cli::cli_div()
-  #         cli::format_bullets_raw(
-  #             stats::setNames(details, rep("*", length(details)))
-  #         )
-  #     }
-  #     details <- details_bullets_div(details)
-  # }
-
   capture_check_cnd(
     check = check,
     file_path = file_path,
@@ -63,19 +43,28 @@ check_tbl_value_col <- function(tbl, round_id, file_path, hub_path) {
   )
 }
 
-
-check_modeling_task_value_col <- function(tbl, output_type_config) {
-  purrr::imap(
-    split(tbl, tbl[["output_type"]]),
-    ~ compare_values_to_config(
-      tbl = .x, output_type = .y,
-      output_type_config
-    )
+check_value_col_by_output_type <- function(tbl, output_type,
+                                           config_tasks, round_id,
+                                           derived_task_ids = NULL) {
+  purrr::map2(
+    .x = match_tbl_to_model_task(tbl, config_tasks,
+      round_id, output_type,
+      derived_task_ids = derived_task_ids
+    ),
+    .y = get_round_output_types(config_tasks, round_id),
+    \(.x, .y) {
+      compare_values_to_config(
+        tbl = .x, output_type_config = .y, output_type = output_type
+      )
+    }
   ) %>%
     unlist(use.names = TRUE)
 }
 
 compare_values_to_config <- function(tbl, output_type, output_type_config) {
+  if (any(is.null(tbl), is.null(output_type_config))) {
+    return(NULL)
+  }
   details <- NULL
   values <- tbl$value
   config <- output_type_config[[output_type]][["value"]]