Merge branch 'main' into ak/derived-task-id-val-check/110

hubverse-org · Dec 12, 2024 · 089b5e8 · 089b5e8
2 parents 0a12910 + 1fdf702
commit 089b5e8
Show file tree

Hide file tree

Showing 9 changed files with 843 additions and 32 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,8 @@
 # hubValidations (development version)
 
-* Added `check_tbl_derived_task_id_vals()` check to `validate_model_data()` that ensures that values in derived task ID columns match expected values for the corresponding derived task IDs in the round as defined in `tasks.json` config (#110). Given the dependence of derived task IDs on the values of other values, the check ignores the combinations of derived task ID values with those of other task IDs and focuses only on identifying values that do not match corresponding accepted values.
+* Added `check_tbl_derived_task_id_vals()` check to `validate_model_data()` that ensures that values in derived task ID columns match expected values for the corresponding derived task IDs in the round as defined in `tasks.json` config (#110). Given the dependence of derived task IDs on the values of their source task IDs, the check ignores the combinations of derived task ID values with those of other task IDs and focuses only on identifying values that do not match corresponding accepted values.
+* `submission_tmpl()` gains the `force_output_types` argument, allowing users to force optional output types to be included in a submission template when `required_vals_only = TRUE`. In conjunction with the use of the `output_types` argument, this allows users to create submission templates which include optional output types they plan to submit.
+* `check_tbl_values_required()` no longer reports false positives for v4 hubs, which fixes the bug reported in #177. Evaluation of whether all combinations of required values have been submitted through `check_tbl_values_required()` is now chunked by output type for v4 config and above. This reduces memory pressure and should speed up required value validation in hubs with complex `tasks.json` files.
 
 # hubValidations 0.9.0
 

diff --git a/R/check_tbl_values_required.R b/R/check_tbl_values_required.R
@@ -39,13 +39,54 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
     force_output_types <- TRUE
   } else {
     # For pre v4 configs, we use the legacy settings and rules.
-    output_types <- NULL
+    output_types <- list(NULL)
     force_output_types <- FALSE
   }
+
+  # Iterate over output types in v4. This reduces memory pressure and also
+  # keeps output type evaluation separate, resolving the bug reported in #177.
+  # v3 and below are unaffected: validation proceeds as before, using the full
+  # modeling task expanded grid.
+  missing_df <- purrr::map(output_types, \(.x) {
+    check_required_output_type_by_modeling_task(
+      tbl = tbl,
+      config_tasks = config_tasks,
+      round_id = round_id,
+      output_type = .x,
+      derived_task_ids = derived_task_ids,
+      force_output_types = force_output_types
+    )
+  }) %>%
+    purrr::list_rbind()
+
+  check <- nrow(missing_df) == 0L
+
+  if (check) {
+    details <- NULL
+  } else {
+    missing_df <- coerce_to_hub_schema(missing_df, config_tasks)
+    details <- cli::format_inline("See {.var missing} attribute for details.")
+  }
+
+  capture_check_cnd(
+    check = check,
+    file_path = file_path,
+    msg_subject = "Required task ID/output type/output type ID combinations",
+    msg_attribute = NULL,
+    msg_verbs = c("all present.", "missing."),
+    details = details,
+    missing = missing_df
+  )
+}
+
+check_required_output_type_by_modeling_task <- function(tbl, config_tasks,
+                                                        round_id, output_type,
+                                                        derived_task_ids,
+                                                        force_output_types) {
   req <- expand_model_out_grid(
     config_tasks,
     round_id = round_id,
-    output_types = output_types,
+    output_types = output_type,
     required_vals_only = TRUE,
     force_output_types = force_output_types,
     all_character = TRUE,
@@ -56,7 +97,7 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
   full <- expand_model_out_grid(
     config_tasks,
     round_id = round_id,
-    output_types = output_types,
+    output_types = output_type,
     required_vals_only = FALSE,
     all_character = TRUE,
     as_arrow_table = FALSE,
@@ -66,30 +107,11 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path,
 
   tbl <- join_tbl_to_model_task(full, tbl, subset_to_tbl_cols = TRUE)
 
-  missing_df <- purrr::pmap(
+  purrr::pmap(
     combine_mt_inputs(tbl, req, full),
     check_modeling_task_values_required
   ) %>%
     purrr::list_rbind()
-
-  check <- nrow(missing_df) == 0L
-
-  if (check) {
-    details <- NULL
-  } else {
-    missing_df <- coerce_to_hub_schema(missing_df, config_tasks)
-    details <- cli::format_inline("See {.var missing} attribute for details.")
-  }
-
-  capture_check_cnd(
-    check = check,
-    file_path = file_path,
-    msg_subject = "Required task ID/output type/output type ID combinations",
-    msg_attribute = NULL,
-    msg_verbs = c("all present.", "missing."),
-    details = details,
-    missing = missing_df
-  )
 }
 
 check_modeling_task_values_required <- function(tbl, req, full) {
@@ -173,7 +195,6 @@ get_opt_col_list <- function(x, mask, full, req) {
 # Identify missing required values for optional value combinations.
 # Output full missing rows compiled from optional values and missing required values.
 missing_req_rows <- function(opt_cols, x, mask, req, full) {
-
   if (all(opt_cols == FALSE)) {
     return(req[!conc_rows(req) %in% conc_rows(x), ])
   }

diff --git a/R/submission_tmpl.R b/R/submission_tmpl.R
@@ -20,15 +20,25 @@
 #' values only.
 #'
 #' @details
-#' For task IDs or output_type_ids where all values are optional, by default, columns
-#' are included as columns of `NA`s when `required_vals_only = TRUE`.
+#' For task IDs where all values are optional, by default, columns
+#' are created as columns of `NA`s when `required_vals_only = TRUE`.
 #' When such columns exist, the function returns a tibble with zero rows, as no
 #' complete cases of required value combinations exist.
 #' _(Note that determination of complete cases excludes valid `NA`
 #' `output_type_id` values in `"mean"` and `"median"` output types)._
 #' To return a template of incomplete required cases, which includes `NA` columns, use
 #' `complete_cases_only = FALSE`.
 #'
+#' To include output types that are optional in the submission template
+#' when `required_vals_only = TRUE` and `complete_cases_only = FALSE`, use
+#' `force_output_types = TRUE`. Use this in combination with subsetting for
+#' output types you plan to submit via argument `output_types` to create a
+#' submission template customised to your submission plans.
+#' _Tip: to ensure you create a template with all required output types, it's
+#' a good idea to first run the function without subsetting or forcing output
+#' types and examine the unique values in `output_type` to check which output
+#' types are required._
+#'
 #' When sample output types are included in the output, the `output_type_id`
 #' column contains example sample indexes which are useful for identifying the
 #' compound task ID structure of multivariate sampling distributions in particular,
@@ -116,8 +126,19 @@
 #'   derived_task_ids = "target_end_date",
 #'   complete_cases_only = FALSE
 #' )
+#' # Force optional output type, in this case "mean".
+#' submission_tmpl(
+#'   config_tasks = config_tasks,
+#'   round_id = "2022-12-12",
+#'   required_vals_only = TRUE,
+#'   output_types = c("pmf", "quantile", "mean"),
+#'   force_output_types = TRUE,
+#'   derived_task_ids = "target_end_date",
+#'   complete_cases_only = FALSE
+#' )
 submission_tmpl <- function(hub_con, config_tasks, round_id,
                             required_vals_only = FALSE,
+                            force_output_types = FALSE,
                             complete_cases_only = TRUE,
                             compound_taskid_set = NULL,
                             output_types = NULL,
@@ -138,15 +159,39 @@ submission_tmpl <- function(hub_con, config_tasks, round_id,
       derived_task_ids, config_tasks, round_id
     )
   }
-
   tmpl_df <- expand_model_out_grid(config_tasks,
     round_id = round_id,
     required_vals_only = required_vals_only,
     include_sample_ids = TRUE,
     compound_taskid_set = compound_taskid_set,
     output_types = output_types,
-    derived_task_ids = derived_task_ids
+    derived_task_ids = derived_task_ids,
+    force_output_types = force_output_types
   )
+  if (nrow(tmpl_df) == 0L && !complete_cases_only) {
+    # If all output_types are optional, expand_model_out_grid returns
+    # a zero-row, zero-column data.frame. To attempt to expand required task ID
+    # values when complete_cases_only = FALSE, we use
+    # force_output_types = TRUE to force the output types to be included. We
+    # then remove output type related columns and create a data.frame of
+    # required task ID values only.
+    tmpl_df <- expand_model_out_grid(
+      config_tasks,
+      round_id = round_id,
+      required_vals_only = required_vals_only,
+      include_sample_ids = TRUE,
+      compound_taskid_set = compound_taskid_set,
+      output_types = output_types,
+      derived_task_ids = derived_task_ids,
+      force_output_types = TRUE
+    )
+    output_cols <- hubUtils::std_colnames[c("output_type", "output_type_id", "value")]
+    tmpl_df <- tmpl_df[setdiff(names(tmpl_df), output_cols)] |>
+      unique()
+  }
+  if (nrow(tmpl_df) == 0L) {
+    return(tmpl_df)
+  }
 
   tmpl_cols <- c(
     hubUtils::get_round_task_id_names(
@@ -155,7 +200,6 @@ submission_tmpl <- function(hub_con, config_tasks, round_id,
     ),
     hubUtils::std_colnames[names(hubUtils::std_colnames) != "model_id"]
   )
-
   # Add NA columns for value and all optional cols
   na_cols <- tmpl_cols[!tmpl_cols %in% names(tmpl_df)]
   tmpl_df[, na_cols] <- NA

diff --git a/man/submission_tmpl.Rd b/man/submission_tmpl.Rd
diff --git a/tests/testthat/test-check_tbl_values_required.R b/tests/testthat/test-check_tbl_values_required.R
@@ -373,3 +373,27 @@ test_that("Reading derived_task_ids from config works", {
     check_tbl_values_required(tbl, round_id, file_path, hub_path)
   )
 })
+
+test_that("v4 config output type leak fixed (#177)", {
+  hub_path <- test_path("testdata", "hub-177")
+  file_path <- "FluSight-baseline/2024-12-14-FluSight-baseline.parquet"
+  round_id <- "2024-12-14"
+  tbl <- read_model_out_file(
+    file_path = file_path,
+    hub_path = hub_path,
+    coerce_types = "chr"
+  )
+  res <- check_tbl_values_required(tbl,
+    round_id = round_id,
+    file_path = file_path, hub_path = hub_path
+  )
+
+  expect_s3_class(res,
+    c(
+      "check_success", "hub_check",
+      "rlang_message", "message",
+      "condition"
+    ),
+    exact = TRUE
+  )
+})
diff --git a/tests/testthat/test-submission_tmpl.R b/tests/testthat/test-submission_tmpl.R
@@ -255,3 +255,52 @@ test_that("submission_tmpl ignoring derived task ids works", {
     )
   )
 })
+
+
+test_that("submission_tmpl force_output_types works", {
+  config_tasks <- read_config_file(
+    test_path(
+      "testdata", "configs",
+      "tasks-samples-v4.json"
+    )
+  )
+  # When force_output_types is not set and all output_types are optional, a
+  # zero-row, zero-column data.frame is returned by default.
+  req_non_force_default <- submission_tmpl(
+    config_tasks = config_tasks,
+    round_id = "2022-10-22",
+    required_vals_only = TRUE,
+    output_types = "sample"
+  )
+  expect_equal(dim(req_non_force_default), c(0L, 0L))
+  # When force_output_types is not set, all output_types are optional and
+  # complete_cases_only = FALSE, a data.frame containing required task ID
+  # values is returned, with all optional task IDs and output type related
+  # columns set to NA.
+  expect_warning({
+    req_non_force <- submission_tmpl(
+      config_tasks = config_tasks,
+      round_id = "2022-10-22",
+      required_vals_only = TRUE,
+      output_types = "sample",
+      complete_cases_only = FALSE
+    )
+  }, "all optional values") |> suppressMessages()
+  expect_equal(dim(req_non_force), c(4L, 9L))
+  expect_equal(unique(req_non_force$output_type), NA_character_)
+
+  # When force_output_types is TRUE, the requested output type should be
+  # returned.
+  expect_warning({
+    req_force <- submission_tmpl(
+      config_tasks = config_tasks,
+      round_id = "2022-10-22",
+      required_vals_only = TRUE,
+      force_output_types = TRUE,
+      output_types = "sample",
+      complete_cases_only = FALSE
+    )
+  }, "all optional values") |> suppressMessages()
+  expect_equal(dim(req_force), c(4L, 9L))
+  expect_equal(unique(req_force$output_type), "sample")
+})
diff --git a/tests/testthat/testdata/hub-177/hub-config/admin.json b/tests/testthat/testdata/hub-177/hub-config/admin.json
@@ -0,0 +1,25 @@
+{
+    "schema_version": "https://raw.githubusercontent.com/hubverse-org/schemas/main/v4.0.0/admin-schema.json",
+    "name": "US CDC FluSight",
+    "maintainer": "US CDC",
+    "contact": {
+        "name": "Joe bloggs",
+        "email": "[email protected]"
+    },
+    "repository": {
+        "host": "github",
+        "owner": "cdcepi",
+        "repository": "FluSight-forecast-hub"
+    },
+    "file_format": ["csv", "parquet"],
+    "timezone": "US/Eastern",
+    "model_output_dir": "model-output",
+    "cloud": {
+        "enabled": true,
+        "host": {
+          "name": "aws",
+          "storage_service": "s3",
+          "storage_location": "cdcepi-flusight-forecast-hub"
+        }
+    }
+}