Merge pull request #176 from hubverse-org/ak/derived-task-id-val-chec…

…k/110 Add `check_tbl_derived_task_id_vals()` function
hubverse-org · Dec 12, 2024 · c21383e · c21383e
2 parents 1fdf702 + 9660c5f
commit c21383e
Show file tree

Hide file tree

Showing 15 changed files with 560 additions and 205 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -27,6 +27,7 @@ export(check_submission_metadata_file_exists)
 export(check_submission_time)
 export(check_tbl_col_types)
 export(check_tbl_colnames)
+export(check_tbl_derived_task_id_vals)
 export(check_tbl_match_round_id)
 export(check_tbl_rows_unique)
 export(check_tbl_spl_compound_taskid_set)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 # hubValidations (development version)
 
+* Added `check_tbl_derived_task_id_vals()` check to `validate_model_data()` that ensures that values in derived task ID columns match expected values for the corresponding derived task IDs in the round as defined in `tasks.json` config (#110). Given the dependence of derived task IDs on the values of their source task IDs, the check ignores the combinations of derived task ID values with those of other task IDs and focuses only on identifying values that do not match corresponding accepted values.
 * `submission_tmpl()` gains the `force_output_types` argument, allowing users to force optional output types to be included in a submission template when `required_vals_only = TRUE`. In conjunction with the use of the `output_types` argument, this allows users to create submission templates which include optional output types they plan to submit.
 * `check_tbl_values_required()` no longer reports false positives for v4 hubs, which fixes the bug reported in #177. Evaluation of whether all combinations of required values have been submitted through `check_tbl_values_required()` is now chunked by output type for v4 config and above. This reduces memory pressure and should speed up required value validation in hubs with complex tasks.json files.
 

diff --git a/R/check_tbl_derived_task_id_vals.R b/R/check_tbl_derived_task_id_vals.R
@@ -0,0 +1,86 @@
+#' Check derived task ID columns contain valid values
+#'
+#' This check is used to validate that values in any derived task ID columns
+#' match accepted values for each derived task ID in the config.
+#' Given the dependence of derived task IDs on the values of their source task IDs,
+#' it ignores the combinations of derived task ID values with those of other task IDs
+#' and focuses only on identifying values that do not match the accepted values.
+#' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**.
+#' @inherit check_tbl_colnames params
+#' @inheritParams expand_model_out_grid
+#' @return
+#' Depending on whether validation has succeeded, one of:
+#' - `<message/check_success>` condition class object.
+#' - `<error/check_failure>` condition class object.
+#'
+#' If no `derived_task_ids` are specified, the check is skipped and a
+#' `<message/check_info>` condition class object is returned.
+#'
+#' Returned object also inherits from subclass `<hub_check>`.
+#' @export
+check_tbl_derived_task_id_vals <- function(
+    tbl, round_id, file_path, hub_path,
+    derived_task_ids = get_hub_derived_task_ids(
+      hub_path, round_id
+    )) {
+  if (is.null(derived_task_ids)) { # no derived task IDs: skip with an info condition
+    return(
+      capture_check_info(
+        file_path = file_path,
+        msg = "No derived task IDs to check. Skipping derived task ID value check."
+      )
+    )
+  }
+  config_tasks <- read_config(hub_path, "tasks") # the hub's tasks config
+
+  derived_task_id_vals <- get_round_config_values(
+    config_tasks = config_tasks,
+    round_id = round_id,
+    derived_task_ids = NULL # presumably so derived task IDs are not excluded - confirm
+  )[derived_task_ids] # keep only the entries for the derived task IDs
+
+  setdiff_vals <- purrr::map2( # per column: tbl values absent from the config values
+    tbl[derived_task_ids], derived_task_id_vals, setdiff
+  )
+
+  invalid_derived_task_ids <- lengths(setdiff_vals) > 0L # TRUE where a column has unexpected values
+
+  check <- !any(invalid_derived_task_ids) # passes only if no column has unexpected values
+
+  if (check) {
+    details <- NULL
+    errors <- NULL
+  } else {
+    invalid_vals <- setdiff_vals[invalid_derived_task_ids] # failing columns only
+    invalid_vals_msg <- purrr::map_chr( # one cli template string per failing column
+      seq_along(invalid_vals),
+      \(.x) {
+        paste0( # templates refer to `invalid_vals` by index; evaluated by format_inline() below
+          "{.arg {names(invalid_vals)[", .x,
+          "]}}: {.val {invalid_vals[[", .x, "]]}}"
+        )
+      }
+    )
+    details <- c(
+      invalid_vals_msg,
+      "see {.code errors} for more details."
+    ) |>
+      paste(collapse = ", ") |>
+      cli::format_inline() # interpolates the {...} templates into the final text
+    errors <- invalid_vals # named list: invalid values per failing derived task ID
+  }
+
+  capture_check_cnd(
+    check = check,
+    file_path = file_path,
+    msg_subject = "{.var tbl}",
+    msg_attribute = "",
+    msg_verbs = c(
+      "contains valid derived task ID values.",
+      "contains invalid derived task ID values."
+    ),
+    errors = errors,
+    error = FALSE, # NOTE(review): presumably a check_failure rather than check_error - confirm
+    details = details
+  )
+}
diff --git a/R/validate_model_data.R b/R/validate_model_data.R
@@ -170,6 +170,16 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
     return(checks)
   }
 
+  checks$derived_task_id_vals <- try_check(
+    check_tbl_derived_task_id_vals(
+      tbl_chr,
+      round_id = round_id,
+      file_path = file_path,
+      hub_path = hub_path,
+      derived_task_ids = derived_task_ids
+    ), file_path
+  )
+
   checks$rows_unique <- try_check(
     check_tbl_rows_unique(
       tbl_chr,

diff --git a/inst/check_table.csv b/inst/check_table.csv
@@ -14,9 +14,10 @@ unique_round_id,Round ID column contains a single unique round ID. Skipped if `r
 match_round_id,Round ID from file contents matches round ID from file name. Skipped if `round_id_from_var` is FALSE in config.,TRUE,check_error,validate_model_data,check_tbl_match_round_id,,FALSE
 colnames,File column names match expected column names for round (i.e. task ID names + hub standard column names),TRUE,check_error,validate_model_data,check_tbl_colnames,,FALSE
 col_types,File column types match expected column types from config. Mainly applicable to parquet & arrow files.,FALSE,check_failure,validate_model_data,check_tbl_col_types,,FALSE
-valid_vals,Columns (excluding `value` column) contain valid combinations of task ID / output type / output type ID values,TRUE,check_error,validate_model_data,check_tbl_values,error_tbl: table of invalid task ID/output type/output type ID value combinations,FALSE
-rows_unique,Columns (excluding `value` column) contain unique combinations of task ID / output type / output type ID values,FALSE,check_failure,validate_model_data,check_tbl_rows_unique,,FALSE
-req_vals,Columns (excluding `value` column) contain all required combinations of task ID / output type / output type ID values,FALSE,check_failure,validate_model_data,check_tbl_values_required,missing_df: table of missing task ID/output type/output type ID value combinations,FALSE
+valid_vals,Columns (excluding the `value` and any derived task ID columns) contain valid combinations of task ID / output type / output type ID values,TRUE,check_error,validate_model_data,check_tbl_values,error_tbl: table of invalid task ID/output type/output type ID value combinations,FALSE
+derived_task_id_vals,Derived task ID columns contain valid values.,FALSE,check_failure,validate_model_data,check_tbl_derived_task_id_vals,errors: named list of derived task ID values. Each element contains the invalid values for each derived task ID that failed the check.,FALSE
+rows_unique,Columns (excluding the `value` and any derived task ID columns) contain unique combinations of task ID / output type / output type ID values,FALSE,check_failure,validate_model_data,check_tbl_rows_unique,,FALSE
+req_vals,Columns (excluding the `value` and any derived task ID columns) contain all required combinations of task ID / output type / output type ID values,FALSE,check_failure,validate_model_data,check_tbl_values_required,missing_df: table of missing task ID/output type/output type ID value combinations,FALSE
 value_col_valid,Values in `value` column are coercible to data type configured for each output type,FALSE,check_failure,validate_model_data,check_tbl_value_col,,FALSE
 value_col_non_desc,Values in `value` column are non-decreasing as output_type_ids increase for all unique task ID /output type value combinations. Applies to `quantile` or `cdf` output types only,FALSE,check_failure,validate_model_data,check_tbl_value_col_ascending,error_tbl: table of rows affected,FALSE
 value_col_sum1,Values in the `value` column of `pmf` output type data for each unique task ID combination sum to 1.,FALSE,check_failure,validate_model_data,check_tbl_value_col_sum1,error_tbl: table of rows affected,FALSE

diff --git a/man/check_tbl_derived_task_id_vals.Rd b/man/check_tbl_derived_task_id_vals.Rd
diff --git a/man/validate_model_data.Rd b/man/validate_model_data.Rd
diff --git a/man/validate_pr.Rd b/man/validate_pr.Rd
diff --git a/man/validate_submission.Rd b/man/validate_submission.Rd
diff --git a/tests/testthat/_snaps/check_tbl_derived_task_id_vals.md b/tests/testthat/_snaps/check_tbl_derived_task_id_vals.md
@@ -0,0 +1,28 @@
+# check_tbl_derived_task_ids_vals works
+
+    Code
+      check_tbl_derived_task_id_vals(tbl, round_id, file_path, hub_path)
+    Output
+      <message/check_success>
+      Message:
+      `tbl` contains valid derived task ID values.
+
+---
+
+    Code
+      check_tbl_derived_task_id_vals(tbl, round_id, file_path, hub_path,
+        derived_task_ids = NULL)
+    Output
+      <message/check_info>
+      Message:
+      No derived task IDs to check. Skipping derived task ID value check.
+
+---
+
+    Code
+      check_tbl_derived_task_id_vals(tbl, round_id, file_path, hub_path)
+    Output
+      <error/check_failure>
+      Error:
+      ! `tbl` contains invalid derived task ID values.  `target_date`: "random_val", see `errors` for more details.
+