hubverse-org · zkamvar · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # hubValidations (development version)
 
+* Evaluation of whether all combinations of required values have been submitted through `check_tbl_values_required()` is now chunked by output type for v4 config and above. This reduces memory pressure and should speed up required value validation in hubs with complex task.json files. It also fixes the bug reported in #177.
+
 # hubValidations 0.9.0
 
 * Re-exported functions useful for modelers (#149):

diff --git a/R/check_tbl_values_required.R b/R/check_tbl_values_required.R
@@ -39,13 +39,54 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
     force_output_types <- TRUE
   } else {
     # For pre v4 configs, we use the legacy settings and rules.
-    output_types <- NULL
+    output_types <- list(NULL)
     force_output_types <- FALSE
   }
+
+  # Iterate over output types in v4. This reduces memory pressure and also
+  # keeps output type evaluation separate, resolving the bug reported in #177.
+  # v3 and below is unaffected and validation proceeds as before using the full
+  # modeling task expanded grid
+  missing_df <- purrr::map(output_types, \(.x) {
+    check_required_output_type_by_modeling_task(
+      tbl = tbl,
+      config_tasks = config_tasks,
+      round_id = round_id,
+      output_type = .x,
+      derived_task_ids = derived_task_ids,
+      force_output_types = force_output_types
+    )
+  }) %>%
+    purrr::list_rbind()
+
+  check <- nrow(missing_df) == 0L
+
+  if (check) {
+    details <- NULL
+  } else {
+    missing_df <- coerce_to_hub_schema(missing_df, config_tasks)
+    details <- cli::format_inline("See {.var missing} attribute for details.")
+  }
+
+  capture_check_cnd(
+    check = check,
+    file_path = file_path,
+    msg_subject = "Required task ID/output type/output type ID combinations",
+    msg_attribute = NULL,
+    msg_verbs = c("all present.", "missing."),
+    details = details,
+    missing = missing_df
+  )
+}
+
+check_required_output_type_by_modeling_task <- function(tbl, config_tasks,
+                                                        round_id, output_type,
+                                                        derived_task_ids,
+                                                        force_output_types) {
   req <- expand_model_out_grid(
     config_tasks,
     round_id = round_id,
-    output_types = output_types,
+    output_types = output_type,
     required_vals_only = TRUE,
     force_output_types = force_output_types,
     all_character = TRUE,
@@ -56,7 +97,7 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
   full <- expand_model_out_grid(
     config_tasks,
     round_id = round_id,
-    output_types = output_types,
+    output_types = output_type,
     required_vals_only = FALSE,
     all_character = TRUE,
     as_arrow_table = FALSE,
@@ -66,30 +107,11 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
 
   tbl <- join_tbl_to_model_task(full, tbl, subset_to_tbl_cols = TRUE)
 
-  missing_df <- purrr::pmap(
+  purrr::pmap(
     combine_mt_inputs(tbl, req, full),
     check_modeling_task_values_required
   ) %>%
     purrr::list_rbind()
-
-  check <- nrow(missing_df) == 0L
-
-  if (check) {
-    details <- NULL
-  } else {
-    missing_df <- coerce_to_hub_schema(missing_df, config_tasks)
-    details <- cli::format_inline("See {.var missing} attribute for details.")
-  }
-
-  capture_check_cnd(
-    check = check,
-    file_path = file_path,
-    msg_subject = "Required task ID/output type/output type ID combinations",
-    msg_attribute = NULL,
-    msg_verbs = c("all present.", "missing."),
-    details = details,
-    missing = missing_df
-  )
 }
 
 check_modeling_task_values_required <- function(tbl, req, full) {
@@ -173,7 +195,6 @@ get_opt_col_list <- function(x, mask, full, req) {
 # Identify missing required values for optional value combinations.
 # Output full missing rows compiled from optional values and missing required values.
 missing_req_rows <- function(opt_cols, x, mask, req, full) {
-
   if (all(opt_cols == FALSE)) {
     return(req[!conc_rows(req) %in% conc_rows(x), ])
   }

diff --git a/tests/testthat/test-check_tbl_values_required.R b/tests/testthat/test-check_tbl_values_required.R
@@ -373,3 +373,27 @@ test_that("Reading derived_task_ids from config works", {
     check_tbl_values_required(tbl, round_id, file_path, hub_path)
   )
 })
+
+test_that("v4 config output type leak fixed (#177)", {
+  hub_path <- test_path("testdata", "hub-177")
+  file_path <- "FluSight-baseline/2024-12-14-FluSight-baseline.parquet"
+  round_id <- "2024-12-14"
+  tbl <- read_model_out_file(
+    file_path = file_path,
+    hub_path = hub_path,
+    coerce_types = "chr"
+  )
+  res <- check_tbl_values_required(tbl,
+    round_id = round_id,
+    file_path = file_path, hub_path = hub_path
+  )
+
+  expect_s3_class(res,
+    c(
+      "check_success", "hub_check",
+      "rlang_message", "message",
+      "condition"
+    ),
+    exact = TRUE
+  )
+})
diff --git a/tests/testthat/testdata/hub-177/hub-config/admin.json b/tests/testthat/testdata/hub-177/hub-config/admin.json
@@ -0,0 +1,25 @@
+{
+    "schema_version": "https://raw.githubusercontent.com/hubverse-org/schemas/main/v4.0.0/admin-schema.json",
+    "name": "US CDC FluSight",
+    "maintainer": "US CDC",
+    "contact": {
+        "name": "Joe bloggs",
+        "email": "[email protected]"
+    },
+    "repository": {
+        "host": "github",
+        "owner": "cdcepi",
+        "repository": "FluSight-forecast-hub"
+    },
+    "file_format": ["csv", "parquet"],
+    "timezone": "US/Eastern",
+    "model_output_dir": "model-output",
+    "cloud": {
+        "enabled": true,
+        "host": {
+          "name": "aws",
+          "storage_service": "s3",
+          "storage_location": "cdcepi-flusight-forecast-hub"
+        }
+    }
+}