hubverse-org · annakrystalli · Jul 25, 2024 · Jul 22, 2024 · Jul 23, 2024 · Jul 23, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: hubValidations
 Title: Testing framework for hubverse hub validations
-Version: 0.2.0
+Version: 0.3.0
 Authors@R: c(
     person(
         given = "Anna", 
@@ -38,7 +38,7 @@ Imports:
     fs,
     gh,
     hubAdmin (>= 1.0.0),
-    hubData (>= 1.0.0),
+    hubData (>= 1.1.0),
     hubUtils (>= 0.1.0),
     jsonlite,
     jsonvalidate,

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# hubValidations 0.3.0
+
+* Introduce an `output_type_id_datatype` argument to `validate_pr()`, `validate_submission()`, `validate_model_data()` and `expand_model_out_grid()` and set its default value to `"from_config"`. With this default, the data type specified in the `output_type_id_datatype` property in `tasks.json` (introduced in schema version `v3.0.1`) is used to set the data type of the hub-level `output_type_id` column. If the property is not set in the config, the functions fall back to `"auto"`, which detects the simplest data type that can represent all output type id values across all output types and rounds. The argument also allows hub administrators to override this setting manually during validation.
+
 # hubValidations 0.2.0
 
 * Move and rename the following `hubData` functions to `hubValidations`:

diff --git a/R/check_tbl_col_types.R b/R/check_tbl_col_types.R
@@ -3,19 +3,27 @@
 #' Check that model output data column datatypes conform to those define in the
 #' hub config.
 #' @inherit check_tbl_colnames params
+#' @inheritParams hubData::create_hub_schema
 #' @return
 #' Depending on whether validation has succeeded, one of:
 #' - `<message/check_success>` condition class object.
 #' - `<warning/check_failure>` condition class object.
 #'
 #' Returned object also inherits from subclass `<hub_check>`.
 #' @export
-check_tbl_col_types <- function(tbl, file_path, hub_path) {
+check_tbl_col_types <- function(tbl, file_path, hub_path,
+                                output_type_id_datatype = c(
+                                  "from_config", "auto", "character",
+                                  "double", "integer",
+                                  "logical", "Date"
+                                )) {
+  output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
   config_tasks <- hubUtils::read_config(hub_path, "tasks")
 
   schema <- hubData::create_hub_schema(config_tasks,
     partitions = NULL,
-    r_schema = TRUE
+    r_schema = TRUE,
+    output_type_id_datatype = output_type_id_datatype
   )[names(tbl)]
 
   tbl_types <- purrr::map_chr(tbl, ~ if (inherits(.x, "numeric")) {
@@ -31,12 +39,13 @@ check_tbl_col_types <- function(tbl, file_path, hub_path) {
     details <- NULL
   } else {
     invalid_cols <- names(compare_types)[!compare_types]
-    details <- paste(
+    details <- paste0(
       "{.var ", invalid_cols,
       "} should be {.val ", schema[invalid_cols],
       "} not {.val ", tbl_types[invalid_cols], "}"
     ) %>%
       paste(collapse = ", ") %>%
+      paste0(".") %>%
       cli::format_inline()
   }
 

diff --git a/R/expand_model_out_grid.R b/R/expand_model_out_grid.R
@@ -95,10 +95,16 @@ expand_model_out_grid <- function(config_tasks,
                                   round_id,
                                   required_vals_only = FALSE,
                                   all_character = FALSE,
+                                  output_type_id_datatype = c(
+                                    "from_config", "auto", "character",
+                                    "double", "integer",
+                                    "logical", "Date"
+                                  ),
                                   as_arrow_table = FALSE,
                                   bind_model_tasks = TRUE,
                                   include_sample_ids = FALSE) {
   round_idx <- hubUtils::get_round_idx(config_tasks, round_id)
+  output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
 
   round_config <- purrr::pluck(
     config_tasks,
@@ -156,7 +162,8 @@ expand_model_out_grid <- function(config_tasks,
     config_tasks,
     all_character = all_character,
     as_arrow_table = as_arrow_table,
-    bind_model_tasks = bind_model_tasks
+    bind_model_tasks = bind_model_tasks,
+    output_type_id_datatype = output_type_id_datatype
   )
 }
 
@@ -221,7 +228,9 @@ fix_round_id <- function(x, round_id, round_config, round_ids) {
 # - binding multiple modeling task grids together.
 process_mt_grid_outputs <- function(x, config_tasks, all_character,
                                     as_arrow_table = TRUE,
-                                    bind_model_tasks = TRUE) {
+                                    bind_model_tasks = TRUE,
+                                    output_type_id_datatype = output_type_id_datatype) {
+
   if (bind_model_tasks) {
     # To bind multiple modeling task grids together, we need to ensure they contain
     # the same columns. Any missing columns are padded with NAs.
@@ -232,7 +241,8 @@ process_mt_grid_outputs <- function(x, config_tasks, all_character,
     schema_cols <- names(
       hubData::create_hub_schema(
         config_tasks,
-        partitions = NULL
+        partitions = NULL,
+        output_type_id_datatype = output_type_id_datatype
       )
     )
     all_cols <- schema_cols[schema_cols %in% all_cols]
@@ -252,7 +262,8 @@ process_mt_grid_outputs <- function(x, config_tasks, all_character,
       ~ hubData::coerce_to_hub_schema(
         .x,
         config_tasks,
-        as_arrow_table = as_arrow_table
+        as_arrow_table = as_arrow_table,
+        output_type_id_datatype = output_type_id_datatype
       )
     )
   }

diff --git a/R/opt_check_tbl_col_timediff.R b/R/opt_check_tbl_col_timediff.R
@@ -3,14 +3,20 @@
 #' @param t0_colname Character string. The name of the time zero date column.
 #' @param t1_colname Character string. The name of the time zero + 1 time step date column.
 #' @param timediff an object of class `lubridate` [`Period-class`] and length 1.
+#' @inheritParams hubData::create_hub_schema
 #' @details
 #' Should be deployed as part of `validate_model_data` optional checks.
 #' @inherit check_tbl_colnames params
 #' @inherit check_tbl_col_types return
 #' @export
 opt_check_tbl_col_timediff <- function(tbl, file_path, hub_path,
                                        t0_colname, t1_colname,
-                                       timediff = lubridate::weeks(2)) {
+                                       timediff = lubridate::weeks(2),
+                                       output_type_id_datatype = c(
+                                         "from_config", "auto", "character",
+                                         "double", "integer",
+                                         "logical", "Date"
+                                       )) {
   checkmate::assert_class(timediff, "Period")
   checkmate::assert_scalar(timediff)
   checkmate::assert_character(t0_colname, len = 1L)
@@ -19,9 +25,11 @@ opt_check_tbl_col_timediff <- function(tbl, file_path, hub_path,
   checkmate::assert_choice(t1_colname, choices = names(tbl))
 
   config_tasks <- hubUtils::read_config(hub_path, "tasks")
+  output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
   schema <- hubData::create_hub_schema(config_tasks,
     partitions = NULL,
-    r_schema = TRUE
+    r_schema = TRUE,
+    output_type_id_datatype = output_type_id_datatype
   )
   assert_column_date(t0_colname, schema)
   assert_column_date(t1_colname, schema)

diff --git a/R/opt_check_tbl_horizon_timediff.R b/R/opt_check_tbl_horizon_timediff.R
@@ -9,12 +9,18 @@
 #' The period of a single horizon. Default to 1 week.
 #' @inherit check_tbl_colnames params
 #' @inherit check_tbl_col_types return
+#' @inheritParams hubData::create_hub_schema
 #' @details
 #' Should be deployed as part of `validate_model_data` optional checks.
 #' @export
 opt_check_tbl_horizon_timediff <- function(tbl, file_path, hub_path, t0_colname,
                                            t1_colname, horizon_colname = "horizon",
-                                           timediff = lubridate::weeks()) {
+                                           timediff = lubridate::weeks(),
+                                           output_type_id_datatype = c(
+                                             "from_config", "auto", "character",
+                                             "double", "integer",
+                                             "logical", "Date"
+                                           )) {
   checkmate::assert_class(timediff, "Period")
   checkmate::assert_scalar(timediff)
   checkmate::assert_character(t0_colname, len = 1L)
@@ -25,9 +31,11 @@ opt_check_tbl_horizon_timediff <- function(tbl, file_path, hub_path, t0_colname,
   checkmate::assert_choice(horizon_colname, choices = names(tbl))
 
   config_tasks <- hubUtils::read_config(hub_path, "tasks")
+  output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
   schema <- hubData::create_hub_schema(config_tasks,
     partitions = NULL,
-    r_schema = TRUE
+    r_schema = TRUE,
+    output_type_id_datatype = output_type_id_datatype
   )
   assert_column_date(t0_colname, schema)
   assert_column_date(t1_colname, schema)

diff --git a/R/read_model_out_file.R b/R/read_model_out_file.R
@@ -1,16 +1,26 @@
 #' Read a model output file
 #'
 #' @inheritParams check_valid_round_id
+#' @inheritParams hubData::create_hub_schema
 #' @param coerce_types character. What to coerce column types to on read.
-#' - `hub`: read in (`csv`) or coerce (`parquet`, `arrow`) to hub schema.
+#' - `hub`: (default) read in (`csv`) or coerce (`parquet`, `arrow`) to hub
+#'   schema. When coercing data types using the `hub` schema, the
+#'   `output_type_id_datatype` argument can also be used to set the
+#'   `output_type_id` column data type manually.
 #' - `chr`: read in (`csv`) or coerce (`parquet`, `arrow`) all columns to character.
 #' - `none`: No coercion. Use `arrow` `read_*` function defaults.
 #' @return a tibble of contents of the model output file.
 #' @export
 read_model_out_file <- function(file_path, hub_path = ".",
-                                coerce_types = c("hub", "chr", "none")) {
+                                coerce_types = c("hub", "chr", "none"),
+                                output_type_id_datatype = c(
+                                  "from_config", "auto", "character",
+                                  "double", "integer",
+                                  "logical", "Date"
+                                )) {
   coerce_types <- rlang::arg_match(coerce_types)
   full_path <- abs_file_path(file_path, hub_path)
+  output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
 
   if (!fs::file_exists(full_path)) {
     rel_path <- rel_file_path(file_path, hub_path) # nolint: object_usage_linter
@@ -31,7 +41,8 @@ read_model_out_file <- function(file_path, hub_path = ".",
       if (coerce_on_read) {
         schema <- create_model_out_schema(
           hub_path,
-          col_types = coerce_types
+          col_types = coerce_types,
+          output_type_id_datatype = output_type_id_datatype
         )
       }
       arrow::read_csv_arrow(
@@ -43,7 +54,8 @@ read_model_out_file <- function(file_path, hub_path = ".",
       if (coerce_types == "hub") {
         arrow::read_parquet(full_path) %>%
           hubData::coerce_to_hub_schema(
-            config_tasks = hubUtils::read_config(hub_path, "tasks")
+            config_tasks = hubUtils::read_config(hub_path, "tasks"),
+            output_type_id_datatype = output_type_id_datatype
           )
       } else if (coerce_types == "chr") {
         arrow::read_parquet(full_path) %>%
@@ -56,7 +68,8 @@ read_model_out_file <- function(file_path, hub_path = ".",
       if (coerce_types == "hub") {
         arrow::read_feather(full_path) %>%
           hubData::coerce_to_hub_schema(
-            config_tasks = hubUtils::read_config(hub_path, "tasks")
+            config_tasks = hubUtils::read_config(hub_path, "tasks"),
+            output_type_id_datatype = output_type_id_datatype
           )
       } else if (coerce_types == "chr") {
         arrow::read_feather(full_path) %>%
@@ -70,11 +83,17 @@ read_model_out_file <- function(file_path, hub_path = ".",
 }
 
 create_model_out_schema <- function(hub_path,
-                                    col_types = c("hub", "chr")) {
+                                    col_types = c("hub", "chr"),
+                                    output_type_id_datatype = c(
+                                      "from_config", "auto", "character",
+                                      "double", "integer",
+                                      "logical", "Date"
+                                    )) {
   col_types <- rlang::arg_match(col_types)
   schema <- hubData::create_hub_schema(
     config_tasks = hubUtils::read_config(hub_path, "tasks"),
-    partitions = NULL
+    partitions = NULL,
+    output_type_id_datatype = output_type_id_datatype
   )
 
   switch(col_types,

diff --git a/R/validate_model_data.R b/R/validate_model_data.R
@@ -2,6 +2,7 @@
 #'
 #' @inheritParams check_tbl_unique_round_id
 #' @inheritParams validate_model_file
+#' @inheritParams hubData::create_hub_schema
 #' @inherit validate_model_file return
 #' @export
 #' @details
@@ -23,11 +24,17 @@
 #' file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
 #' validate_model_data(hub_path, file_path)
 validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
+                                output_type_id_datatype = c(
+                                  "from_config", "auto", "character",
+                                  "double", "integer",
+                                  "logical", "Date"
+                                ),
                                 validations_cfg_path = NULL) {
   checks <- new_hub_validations()
 
   file_meta <- parse_file_name(file_path)
   round_id <- file_meta$round_id
+  output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
 
   # -- File parsing checks ----
   checks$file_read <- try_check(
@@ -113,7 +120,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
     check_tbl_col_types(
       tbl,
       file_path = file_path,
-      hub_path = hub_path
+      hub_path = hub_path,
+      output_type_id_datatype = output_type_id_datatype
     ), file_path
   )
 

diff --git a/R/validate_pr.R b/R/validate_pr.R
@@ -92,15 +92,24 @@
 #' )
 #' }
 validate_pr <- function(hub_path = ".", gh_repo, pr_number,
-                        round_id_col = NULL, validations_cfg_path = NULL,
+                        round_id_col = NULL,
+                        output_type_id_datatype = c(
+                          "from_config", "auto", "character",
+                          "double", "integer",
+                          "logical", "Date"
+                        ), validations_cfg_path = NULL,
                         skip_submit_window_check = FALSE,
-                        file_modification_check = c("error", "warn", "message", "none"),
+                        file_modification_check = c(
+                          "error", "warn",
+                          "message", "none"
+                        ),
                         allow_submit_window_mods = TRUE,
                         submit_window_ref_date_from = c(
                           "file",
                           "file_path"
                         )) {
   file_modification_check <- rlang::arg_match(file_modification_check)
+  output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
   model_output_dir <- get_hub_model_output_dir(hub_path) # nolint: object_name_linter
   model_metadata_dir <- "model-metadata" # nolint: object_name_linter
   validations <- new_hub_validations()
@@ -167,6 +176,7 @@ validate_pr <- function(hub_path = ".", gh_repo, pr_number,
         ~ validate_submission(
           hub_path,
           file_path = .x,
+          output_type_id_datatype = output_type_id_datatype,
           validations_cfg_path = validations_cfg_path,
           skip_submit_window_check = skip_submit_window_check,
           skip_check_config = TRUE

diff --git a/R/validate_submission.R b/R/validate_submission.R
@@ -5,6 +5,7 @@
 #' of the file.
 #'
 #' @inherit validate_model_data return params
+#' @inheritParams hubData::create_hub_schema
 #' @param skip_submit_window_check Logical. Whether to skip the submission window check.
 #' @param skip_check_config Logical. Whether to skip the hub config validation check.
 #'  check.
@@ -38,13 +39,20 @@
 #' validate_submission(hub_path, file_path)
 validate_submission <- function(hub_path, file_path, round_id_col = NULL,
                                 validations_cfg_path = NULL,
+                                output_type_id_datatype = c(
+                                  "from_config", "auto", "character",
+                                  "double", "integer",
+                                  "logical", "Date"
+                                ),
                                 skip_submit_window_check = FALSE,
                                 skip_check_config = FALSE,
                                 submit_window_ref_date_from = c(
                                   "file",
                                   "file_path"
                                 )) {
   check_hub_config <- new_hub_validations()
+  output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
+
   if (!skip_check_config) {
     check_hub_config$valid_config <- try_check(
       check_config_hub_valid(hub_path),
@@ -80,6 +88,7 @@ validate_submission <- function(hub_path, file_path, round_id_col = NULL,
     hub_path = hub_path,
     file_path = file_path,
     round_id_col = round_id_col,
+    output_type_id_datatype = output_type_id_datatype,
     validations_cfg_path = validations_cfg_path
   )