hubverse-org · zkamvar · Dec 12, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 12, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -27,6 +27,7 @@ export(check_submission_metadata_file_exists)
 export(check_submission_time)
 export(check_tbl_col_types)
 export(check_tbl_colnames)
+export(check_tbl_derived_task_id_vals)
 export(check_tbl_match_round_id)
 export(check_tbl_rows_unique)
 export(check_tbl_spl_compound_taskid_set)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # hubValidations (development version)
 
+* Added `check_tbl_derived_task_id_vals()` check to `validate_model_data()` that ensures that values in derived task ID columns match expected values for the corresponding derived task IDs in the round as defined in `tasks.json` config (#110). Given the dependence of derived task IDs on the values of other task IDs, the check ignores the combinations of derived task ID values with those of other task IDs and focuses only on identifying values that do not match corresponding accepted values.
+
 # hubValidations 0.9.0
 
 * Re-exported functions useful for modelers (#149):

diff --git a/R/check_tbl_derived_task_id_vals.R b/R/check_tbl_derived_task_id_vals.R
@@ -0,0 +1,92 @@
+#' Check derived task ID columns contain valid values
+#'
+#' This check is used to validate that values in any derived task ID columns
+#' match accepted values for each derived task ID in the config.
+#' Given the dependence of derived task IDs on the values of other task IDs,
+#' it ignores the combinations of derived task ID values with those of other task IDs
+#' and focuses only on identifying values that do not match the accepted values.
+#' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**.
+#' @inherit check_tbl_colnames params
+#' @inheritParams expand_model_out_grid
+#' @return
+#' Depending on whether validation has succeeded, one of:
+#' - `<message/check_success>` condition class object.
+#' - `<error/check_failure>` condition class object.
+#'
+#' If no `derived_task_ids` are specified, the check is skipped and a
+#' `<message/check_info>` condition class object is returned.
+#'
+#' Returned object also inherits from subclass `<hub_check>`.
+#' @export
+check_tbl_derived_task_id_vals <- function(
+    tbl, round_id, file_path, hub_path,
+    derived_task_ids = get_hub_derived_task_ids(
+      hub_path, round_id
+    )) {
+  # With no derived task IDs there is nothing to compare, so hand back an
+  # informational condition rather than a success/failure result.
+  if (is.null(derived_task_ids)) {
+    skip_cnd <- capture_check_info(
+      file_path = file_path,
+      msg = "No derived task IDs to check. Skipping derived task ID value check."
+    )
+    return(skip_cnd)
+  }
+
+  tasks_config <- read_config(hub_path, "tasks")
+
+  # NOTE(review): `derived_task_ids = NULL` presumably keeps the derived task
+  # ID values in the returned list -- confirm in `get_round_config_values()`.
+  round_vals <- get_round_config_values(
+    config_tasks = tasks_config,
+    round_id = round_id,
+    derived_task_ids = NULL
+  )
+  accepted_vals <- round_vals[derived_task_ids]
+
+  # For each derived task ID column, keep the values found in `tbl` that are
+  # not among the values obtained from the config.
+  unexpected_vals <- purrr::map2(
+    tbl[derived_task_ids],
+    accepted_vals,
+    \(observed, accepted) setdiff(observed, accepted)
+  )
+  has_unexpected <- lengths(unexpected_vals) > 0L
+  check <- !any(has_unexpected)
+
+  # Passing-case defaults; both are replaced below when the check fails.
+  details <- NULL
+  errors <- NULL
+
+  if (!check) {
+    # The cli templates built here refer to `invalid_vals` by name, so this
+    # local must keep that name for `cli::format_inline()` to resolve it.
+    invalid_vals <- unexpected_vals[has_unexpected]
+    idx <- seq_along(invalid_vals)
+    invalid_vals_msg <- paste0(
+      "{.arg {names(invalid_vals)[", idx,
+      "]}}: {.val {invalid_vals[[", idx, "]]}}"
+    )
+    details <- cli::format_inline(
+      paste(
+        c(invalid_vals_msg, "see {.code errors} for more details."),
+        collapse = ", "
+      )
+    )
+    errors <- invalid_vals
+  }
+
+  capture_check_cnd(
+    check = check,
+    file_path = file_path,
+    msg_subject = "{.var tbl}",
+    msg_attribute = "",
+    msg_verbs = c(
+      "contains valid derived task ID values.",
+      "contains invalid derived task ID values."
+    ),
+    errors = errors,
+    error = FALSE,
+    details = details
+  )
+}
diff --git a/R/validate_model_data.R b/R/validate_model_data.R
@@ -170,6 +170,16 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
     return(checks)
   }
 
+  checks$derived_task_id_vals <- try_check(
+    check_tbl_derived_task_id_vals(
+      tbl_chr,
+      round_id = round_id,
+      file_path = file_path,
+      hub_path = hub_path,
+      derived_task_ids = derived_task_ids
+    ), file_path
+  )
+
   checks$rows_unique <- try_check(
     check_tbl_rows_unique(
       tbl_chr,

diff --git a/inst/check_table.csv b/inst/check_table.csv
@@ -14,9 +14,10 @@ unique_round_id,Round ID column contains a single unique round ID. Skipped if `r
 match_round_id,Round ID from file contents matches round ID from file name. Skipped if `round_id_from_var` is FALSE in config.,TRUE,check_error,validate_model_data,check_tbl_match_round_id,,FALSE
 colnames,File column names match expected column names for round (i.e. task ID names + hub standard column names),TRUE,check_error,validate_model_data,check_tbl_colnames,,FALSE
 col_types,File column types match expected column types from config. Mainly applicable to parquet & arrow files.,FALSE,check_failure,validate_model_data,check_tbl_col_types,,FALSE
-valid_vals,Columns (excluding `value` column) contain valid combinations of task ID / output type / output type ID values,TRUE,check_error,validate_model_data,check_tbl_values,error_tbl: table of invalid task ID/output type/output type ID value combinations,FALSE
-rows_unique,Columns (excluding `value` column) contain unique combinations of task ID / output type / output type ID values,FALSE,check_failure,validate_model_data,check_tbl_rows_unique,,FALSE
-req_vals,Columns (excluding `value` column) contain all required combinations of task ID / output type / output type ID values,FALSE,check_failure,validate_model_data,check_tbl_values_required,missing_df: table of missing task ID/output type/output type ID value combinations,FALSE
+valid_vals,Columns (excluding the `value` and any derived task ID columns) contain valid combinations of task ID / output type / output type ID values,TRUE,check_error,validate_model_data,check_tbl_values,error_tbl: table of invalid task ID/output type/output type ID value combinations,FALSE
+derived_task_id_vals,Derived task ID columns contain valid values.,FALSE,check_failure,validate_model_data,check_tbl_derived_task_id_vals,errors: named list of derived task ID values. Each element contains the invalid values for each derived task ID that failed the check.,FALSE
+rows_unique,Columns (excluding the `value` and any derived task ID columns) contain unique combinations of task ID / output type / output type ID values,FALSE,check_failure,validate_model_data,check_tbl_rows_unique,,FALSE
+req_vals,Columns (excluding the `value` and any derived task ID columns) contain all required combinations of task ID / output type / output type ID values,FALSE,check_failure,validate_model_data,check_tbl_values_required,missing_df: table of missing task ID/output type/output type ID value combinations,FALSE
 value_col_valid,Values in `value` column are coercible to data type configured for each output type,FALSE,check_failure,validate_model_data,check_tbl_value_col,,FALSE
 value_col_non_desc,Values in `value` column are non-decreasing as output_type_ids increase for all unique task ID /output type value combinations. Applies to `quantile` or `cdf` output types only,FALSE,check_failure,validate_model_data,check_tbl_value_col_ascending,error_tbl: table of rows affected,FALSE
 value_col_sum1,Values in the `value` column of `pmf` output type data for each unique task ID combination sum to 1.,FALSE,check_failure,validate_model_data,check_tbl_value_col_sum1,error_tbl: table of rows affected,FALSE

diff --git a/man/check_tbl_derived_task_id_vals.Rd b/man/check_tbl_derived_task_id_vals.Rd
diff --git a/man/validate_model_data.Rd b/man/validate_model_data.Rd
diff --git a/man/validate_pr.Rd b/man/validate_pr.Rd
diff --git a/man/validate_submission.Rd b/man/validate_submission.Rd
diff --git a/tests/testthat/_snaps/check_tbl_derived_task_id_vals.md b/tests/testthat/_snaps/check_tbl_derived_task_id_vals.md
@@ -0,0 +1,28 @@
+# check_tbl_derived_task_ids_vals works
+
+    Code
+      check_tbl_derived_task_id_vals(tbl, round_id, file_path, hub_path)
+    Output
+      <message/check_success>
+      Message:
+      `tbl` contains valid derived task ID values.
+
+---
+
+    Code
+      check_tbl_derived_task_id_vals(tbl, round_id, file_path, hub_path,
+        derived_task_ids = NULL)
+    Output
+      <message/check_info>
+      Message:
+      No derived task IDs to check. Skipping derived task ID value check.
+
+---
+
+    Code
+      check_tbl_derived_task_id_vals(tbl, round_id, file_path, hub_path)
+    Output
+      <error/check_failure>
+      Error:
+      ! `tbl` contains invalid derived task ID values.  `target_date`: "random_val", see `errors` for more details.
+