optimise check_tbl_values by output_type batching & ignoring derived …

…task-ids. Related to #93
hubverse-org · Aug 9, 2024 · 2aa70e9 · 2aa70e9
1 parent 21d2f85
commit 2aa70e9
Show file tree

Hide file tree

Showing 5 changed files with 127 additions and 30 deletions.
diff --git a/R/check_tbl_values.R b/R/check_tbl_values.R
@@ -1,42 +1,33 @@
 #' Check model output data tbl contains valid value combinations
 #' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**.
 #' @inherit check_tbl_colnames params
+#' @inheritParams expand_model_out_grid
 #' @inherit check_tbl_colnames return
 #' @export
-check_tbl_values <- function(tbl, round_id, file_path, hub_path) {
+check_tbl_values <- function(tbl, round_id, file_path, hub_path,
+                             derived_task_ids = NULL) {
   config_tasks <- hubUtils::read_config(hub_path, "tasks")
-
-  # Coerce accepted vals to character for easier comparison of
-  # values. Tried to use arrow tbls for comparisons as more efficient when
-  # working with larger files but currently arrow does not match NAs as dplyr
-  # does, returning false positives for mean & median rows which contain NA in
-  # output type ID column.
-  accepted_vals <- expand_model_out_grid(
-    config_tasks = config_tasks,
-    round_id = round_id,
-    all_character = TRUE
-  )
-
-  # This approach uses dplyr to identify tbl rows that don't have a complete match
-  # in accepted_vals.
-  accepted_vals$valid <- TRUE
-  if (hubUtils::is_v3_config(config_tasks)) {
-    out_type_ids <- tbl[["output_type_id"]]
-    tbl[tbl$output_type == "sample", "output_type_id"] <- NA
+  if (!is.null(derived_task_ids)) {
+    tbl[, derived_task_ids] <- NA_character_
   }
 
-  valid_tbl <- dplyr::left_join(
-    tbl, accepted_vals,
-    by = names(tbl)[names(tbl) != "value"]
-  )
+  valid_tbl <- tbl %>%
+    split(f = tbl$output_type) %>%
+    purrr::imap(
+      ~ check_values_by_output_type(tbl = .x, output_type = .y,
+                                 config_tasks = config_tasks,
+                                 round_id = round_id,
+                                 derived_task_ids = derived_task_ids)
+    ) %>% purrr::list_rbind()
 
   check <- !any(is.na(valid_tbl$valid))
 
   if (check) {
     details <- NULL
     error_tbl <- NULL
   } else {
-    error_summary <- summarise_invalid_values(valid_tbl, accepted_vals)
+    error_summary <- summarise_invalid_values(valid_tbl, config_tasks, round_id,
+                                              derived_task_ids)
     details <- error_summary$msg
     if (length(error_summary$invalid_combs_idx) == 0L) {
       error_tbl <- NULL
@@ -66,10 +57,43 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) {
   )
 }
 
-summarise_invalid_values <- function(valid_tbl, accepted_vals) {
+check_values_by_output_type <- function(tbl, output_type, config_tasks, round_id,
+                                     derived_task_ids = NULL) {
+
+  # Coerce accepted vals to character for easier comparison of
+  # values. Tried to use arrow tbls for comparisons as more efficient when
+  # working with larger files but currently arrow does not match NAs as dplyr
+  # does, returning false positives for mean & median rows which contain NA in
+  # output type ID column.
+  accepted_vals <- expand_model_out_grid(
+    config_tasks = config_tasks,
+    round_id = round_id,
+    all_character = TRUE,
+    output_types = output_type,
+    derived_task_ids = derived_task_ids
+  )
+
+  # This approach uses dplyr to identify tbl rows that don't have a complete match
+  # in accepted_vals.
+  accepted_vals$valid <- TRUE
+  if (hubUtils::is_v3_config(config_tasks)) {
+    out_type_ids <- tbl[["output_type_id"]]
+    tbl[tbl$output_type == "sample", "output_type_id"] <- NA
+  }
+
+  valid_tbl <- dplyr::left_join(
+    tbl, accepted_vals,
+    by = names(tbl)[names(tbl) != "value"]
+  )
+
+}
+
+summarise_invalid_values <- function(valid_tbl, config_tasks, round_id,
+                                     derived_task_ids) {
   cols <- names(valid_tbl)[!names(valid_tbl) %in% c("value", "valid")]
   uniq_tbl <- purrr::map(valid_tbl[cols], unique)
-  uniq_config <- purrr::map(accepted_vals[cols], unique)
+  uniq_config <- get_round_config_values(config_tasks, round_id,
+                                         derived_task_ids)[cols]
 
   invalid_vals <- purrr::map2(
     uniq_tbl, uniq_config,

diff --git a/R/config_tasks-utils.R b/R/config_tasks-utils.R
@@ -27,3 +27,64 @@ get_round_output_type_names <- function(config_tasks, round_id,
     out
   }
 }
+
+# get all task_ids values
+get_round_config_values <- function(config_tasks, round_id,
+                                    derived_task_ids = NULL) {
+  model_tasks <- hubUtils::get_round_model_tasks(config_tasks, round_id)
+  task_id_names <- setdiff(
+    hubUtils::get_round_task_id_names(config_tasks, round_id),
+    derived_task_ids
+  )
+  task_id_values <- purrr::map(
+    purrr::set_names(task_id_names),
+    \(.x) get_task_id_values(.x, model_tasks)
+  )
+  if (!is.null(derived_task_ids)) {
+    task_id_values <- c(
+      task_id_values,
+      purrr::map(
+        purrr::set_names(derived_task_ids),
+        ~NA_character_
+      )
+    )
+  }
+
+  output_type_names <- get_round_output_type_names(config_tasks, round_id)
+  output_type_id_values <- purrr::map(
+    output_type_names,
+    \(.x) get_output_type_id_values(.x, model_tasks)
+  ) %>% purrr::flatten_chr()
+
+  output_types <- list(
+    output_type = output_type_names,
+    output_type_id = output_type_id_values
+  )
+
+  c(task_id_values, output_types)
+}
+get_task_id_values <- function(task_id, model_tasks) {
+  purrr::map(
+    model_tasks,
+    ~ .x[["task_ids"]][[task_id]]
+  ) %>%
+    unlist(use.names = FALSE) %>%
+    unique() %>%
+    as.character()
+}
+
+get_output_type_id_values <- function(output_type, model_tasks) {
+  out <- purrr::map(
+    model_tasks,
+    ~ .x[["output_type"]][[output_type]][["output_type_id"]]
+  ) %>%
+    unlist(use.names = FALSE) %>%
+    unique() %>%
+    as.character()
+
+  if (length(out) == 0L) {
+    return(NA_character_)
+  } else {
+    return(out)
+  }
+}
diff --git a/R/validate_model_data.R b/R/validate_model_data.R
@@ -3,6 +3,7 @@
 #' @inheritParams check_tbl_unique_round_id
 #' @inheritParams validate_model_file
 #' @inheritParams hubData::create_hub_schema
+#' @inheritParams expand_model_out_grid
 #' @inherit validate_model_file return
 #' @export
 #' @details
@@ -29,7 +30,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
                                   "double", "integer",
                                   "logical", "Date"
                                 ),
-                                validations_cfg_path = NULL) {
+                                validations_cfg_path = NULL,
+                                derived_task_ids = NULL) {
   checks <- new_hub_validations()
 
   file_meta <- parse_file_name(file_path)
@@ -136,7 +138,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
       tbl_chr,
       round_id = round_id,
       file_path = file_path,
-      hub_path = hub_path
+      hub_path = hub_path,
+      derived_task_ids = derived_task_ids
     ), file_path
   )
   if (is_any_error(checks$valid_vals)) {

diff --git a/man/check_tbl_values.Rd b/man/check_tbl_values.Rd
diff --git a/man/validate_model_data.Rd b/man/validate_model_data.Rd