Improve handling of numeric output type IDs. #60

Merged: 20 commits, Nov 27, 2023

Changes from 11 commits

Commits
aee5bac
add model-metadata-schema to pass updated hubUtils validation checks
annakrystalli Nov 1, 2023
e8da5b4
Improve handling of numeric output type IDs. Resolves #58 & #54
annakrystalli Nov 1, 2023
ea92169
Merge branch 'main' into num-type-ids
annakrystalli Nov 1, 2023
8e0da25
fix error caused by model task with no required task id/output type c…
annakrystalli Nov 2, 2023
ae8c0fc
improve exec error message capturing
annakrystalli Nov 2, 2023
b9374a2
update news
annakrystalli Nov 2, 2023
ae89852
add non req model task tests
annakrystalli Nov 2, 2023
7da8f3a
bump hubUtils requirement
annakrystalli Nov 2, 2023
730040b
Add deployment note in docs
annakrystalli Nov 2, 2023
ec8e614
add ability to parse model metadata file names for metadata
annakrystalli Nov 2, 2023
177e8a5
Add fn opt_check_metadata_team_max_model_n. Resolves #34
annakrystalli Nov 2, 2023
5b53f1b
add test cases (currently failing) that validations reject floating p…
elray1 Nov 2, 2023
7e42400
Update tests/testthat/testdata/hub-chr/hub-config/admin.json
annakrystalli Nov 9, 2023
b116688
Update tests/testthat/testdata/hub-it/hub-config/admin.json
annakrystalli Nov 9, 2023
d756bee
Update tests/testthat/testdata/hub-num/hub-config/admin.json
annakrystalli Nov 9, 2023
0ae59f1
Merge pull request #61 from Infectious-Disease-Modeling-Hubs/test_che…
annakrystalli Nov 9, 2023
e1842e1
skip tests offline
annakrystalli Nov 10, 2023
30910f6
ensure default read behaviour for file type
annakrystalli Nov 10, 2023
27f1e65
require all character tbls
annakrystalli Nov 10, 2023
24a6bf8
read in all character tbl for some checks
annakrystalli Nov 10, 2023
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: hubValidations
Title: Testing framework for hubverse hub validations
Version: 0.0.0.9004
Version: 0.0.0.9005
Authors@R: c(
person(
given = "Anna",
@@ -37,7 +37,7 @@ Imports:
dplyr,
fs,
gh,
hubUtils (>= 0.0.0.9014),
hubUtils (>= 0.0.0.9016),
jsonlite,
jsonvalidate,
lubridate,
1 change: 1 addition & 0 deletions NAMESPACE
@@ -46,6 +46,7 @@ export(is_info)
export(is_success)
export(new_hub_validations)
export(not_pass)
export(opt_check_metadata_team_max_model_n)
export(opt_check_tbl_col_timediff)
export(opt_check_tbl_counts_lt_popn)
export(opt_check_tbl_horizon_timediff)
8 changes: 7 additions & 1 deletion NEWS.md
@@ -1,3 +1,9 @@
# hubValidations 0.0.0.9005

* Improved handling of numeric output type IDs (including high-precision floating points / values with trailing zeros), especially when the overall hub output type ID column is character. This previously led to a number of bugs and false validation failures (#58 & #54), which are addressed in this version.
* Bug fixes with respect to handling modelling tasks with no required task ID / output type combinations.
* Improved capture of error messages when check execution error occurs.

# hubValidations 0.0.0.9004

This release contains a bug fix for reading in and validating CSV column types correctly. (#54)
@@ -10,7 +16,7 @@ This release includes a number of bug fixes:

# hubValidations 0.0.0.9002

This release includes improvements desgined after the first round of sandbox testing on setting up the CDC FluSight hub. Improvements include:
This release includes improvements designed after the first round of sandbox testing on setting up the CDC FluSight hub. Improvements include:

* Export `parse_file_name` function for parsing model output metadata from a model output file name.
* Issue more specific and informative messaging when `check_tbl_values()` check fails.
41 changes: 40 additions & 1 deletion R/check_tbl_values.R
@@ -11,7 +11,9 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) {
# working with larger files but currently arrow does not match NAs as dplyr
# does, returning false positives for mean & median rows which contain NA in
# output type ID column.
tbl <- hubUtils::coerce_to_character(tbl)
tbl <- hubUtils::coerce_to_character(tbl) %>%
coerce_num_output_type_ids(file_path, hub_path)

accepted_vals <- hubUtils::expand_model_out_val_grid(
config_tasks = config_tasks,
round_id = round_id,
@@ -105,3 +107,40 @@ summarise_invalid_values <- function(valid_tbl, accepted_vals) {
invalid_combs_idx = invalid_combs_idx
)
}


get_numeric_output_type_ids <- function(file_path, hub_path) {

get_file_round_config(file_path, hub_path)[["model_tasks"]] %>%
purrr::map(~ .x[["output_type"]]) %>%
unlist(recursive = FALSE) %>%
purrr::map(~ purrr::pluck(.x, "output_type_id")) %>%
purrr::map_lgl(~is.numeric(unlist(.x))) %>%
purrr::keep(isTRUE) %>%
names() %>%
unique()
}


coerce_num_output_type_ids <- function(tbl, file_path, hub_path) {

num_output_types <- get_numeric_output_type_ids(
file_path = file_path,
hub_path = hub_path)

if (any(tbl[["output_type"]] %in% num_output_types) &&
inherits(tbl[["output_type_id"]], "character")) {

type_coerce <- tbl[["output_type"]] %in% num_output_types
num_output_type_id <- suppressWarnings(
as.numeric(tbl$output_type_id[type_coerce])
)
# establish only valid coercions to distinguish between the potential for
# two cdf output types in the same round, one numeric and one character.
valid <- !is.na(num_output_type_id)
tbl$output_type_id[type_coerce][valid] <- as.character(
num_output_type_id[valid]
)
Comment on lines +138 to +140

elray1 (Contributor) commented:
I think the use of as.character here would open up the problems I outlined in discussion #59. I added an example of a unit test that passes, but that I think should not pass, here. In that example, the output_type_id in the submission file is 0.09999999999999997779554. On my system, this is not equal to the floating point representation of 0.1, but as.character casts it to "0.1". Here's an illustration:

> a <- 0.0999999999999999777955
> b <- 0.1000000000000000055511
> c <- 0.1
> a == b
[1] FALSE
> a == c
[1] FALSE
> b == c
[1] TRUE
> as.character(a) == as.character(b)
[1] TRUE
> as.character(a) == as.character(c)
[1] TRUE
> as.character(b) == as.character(c)
[1] TRUE
> as.character(a)
[1] "0.1"

As a result of this, if we use values coerced via as.character to do validations, we may be generous in passing submissions that as.character rounds to the value specified in the tasks config file. This would then cause issues later on in places like ensembling code or filtering code, where grouping or filtering on the output_type_id may give unexpected results.

I am wondering if a way forward might be to do the data type coercion in a different order, or just deal with character representations from the start for purposes of validating output_type_ids and task id variables? Something like:

  1. For purposes of validation, read in all fields other than the value column as character strings. Additionally, read the contents of tasks.json as strings if that's possible (may not be?)?
  2. Check exact equality of values for output_type_ids and task id variables with the required/optional values specified in tasks.json, as strings. For example, if the tasks.json file says that a valid quantile level is 0.1, we expect to see the exact characters "0.1" in the submission file. This would change our current passing validation results for submissions with values like 0.1000000000000000055511 to failures.

Something like this would entirely sidestep the issues of what gets rounded to 0.1 or what the floating point representation of 0.1 is.
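
As a rough sketch of step 2 (hypothetical helper and argument names, not actual package code), comparing everything as exact character strings sidesteps floating point representations entirely:

# Minimal sketch: validate output_type_id values as exact strings.
# `submitted` comes straight from the file, read as character;
# `allowed` holds the character representations of the tasks.json values.
validate_type_ids_as_strings <- function(submitted, allowed) {
  invalid <- setdiff(unique(submitted), allowed)
  if (length(invalid) > 0) {
    stop("Invalid output_type_id value(s): ", toString(invalid))
  }
  invisible(TRUE)
}

# "0.1000000000000000055511" != "0.1" as strings, so this would now fail:
# validate_type_ids_as_strings("0.1000000000000000055511", c("0.1", "0.25"))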

Another option might be to use arrow's cast function for type conversion instead of as.character, since it seems to be more careful in what it rounds. But my question about that is whether the outputs here would be stable across platforms. For example, maybe on one computer the floating point representation of 0.1 is 0.1000000000000000055511 and on another computer (e.g., one that's not using 64 bit representations of numbers) it's something slightly different. For that reason, it seems better to avoid relying on equality of floating point representations if possible.

annakrystalli (Member, Author) replied:
Thanks @elray1. Great points, and I agree: although some output type IDs are numeric, given we want to match output type ID values effectively as categories / use them for grouping (both in validation and downstream), it does feel more appropriate to work with the data as character for these purposes and match exact values, not check for equality using numeric tolerances or rounding behaviours.

So I will look into the most efficient way of including all character versions of data in our model output validation checks.

I will merge your currently failing test to include it in the test suite. I also feel the original test you included (of a 0.1000000000000000055511 value supplied instead of a 0.1 value) should now also fail.

Persistent issue with trailing zeros in tasks.json

The one remaining problem, however, that as.character() was solving for us is removing trailing zeros. When reading the tasks.json into R from JSON and then passing values to expand.grid to create the grid of all possible valid values, any trailing zeros are removed from numeric output type IDs. Although I spent quite some time trying to figure out a way to read in values from JSON in R as character (retaining any trailing zeros), I could not find one, so I don't think the suggestion:

Additionally, read the contents of tasks.json as strings if that's possible (may not be?)?
is currently possible.

Using as.character() on model output data as proposed in this PR was addressing this issue:

as.character(c(0.300, 0.350))
#> [1] "0.3"  "0.35"

so that including 0.300 as an output type ID in a tasks.json would match a 0.300 value in the data (which previously failed because the value from config would be converted to 0.3 while the value in model output data would be read in as 0.300). Having said that, the approach in this PR is still a bit of a hack, as, to match values, trailing zeros are removed from both sides, which is not quite right. I think we will have to tackle this with strong documentation and perhaps some sort of validation check (although I think a validation check will be difficult for the same reasons: detecting trailing zeros in numerics will likely be hampered by the requirement to retain trailing zeros... our original problem!).
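
For example, the trailing zeros are already gone by the time the JSON is parsed (an illustration of the general behaviour using jsonlite, which the package imports; not necessarily the exact hubUtils code path):

jsonlite::fromJSON("[0.300, 0.350]")
#> [1] 0.30 0.35   # stored as the plain doubles 0.3 and 0.35
as.character(jsonlite::fromJSON("[0.300, 0.350]"))
#> [1] "0.3"  "0.35"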

annakrystalli (Member, Author) commented:
OK Just to clarify what the new approach I'm putting together is doing:

  • The read_model_out_file fn now has an argument coerce_types with values hub, chr and none instead of the logical use_hub_schema.
    • hub: reads in (CSVs) or coerces (parquet & arrow, which have a schema defined in the file, using arrow's cast) to the hub schema.
    • chr: reads in (CSVs) or coerces (parquet & arrow) all columns as character.
    • none: no coercion. Read using default arrow::read_* function settings.

This now causes both 0.1000000000000000055511 and 0.09999999999999997779554 to fail validation. It also means that values with trailing zeros (e.g. 0.300) will always fail, regardless of whether 0.300 is specified in tasks.json (leading to the issue discussed in #54). While I do feel this is actually correct behaviour when validating model output data, it still feels unsatisfactory to me from the standpoint that values with trailing zeros can currently be specified in config files but will not be respected when reading in and will cause validation errors. I think this will remain problematic until we find a way to detect and throw a warning about trailing zeros during config file validation.
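
For concreteness, a hypothetical call pattern for the three modes (the exact signature of read_model_out_file may differ from this sketch; the coerce_types values are as described above):

# hub: read (CSV) or cast (parquet/arrow) to the hub schema
tbl_hub <- read_model_out_file(file_path, hub_path, coerce_types = "hub")
# chr: all columns as character, enabling exact string matching of values
tbl_chr <- read_model_out_file(file_path, hub_path, coerce_types = "chr")
# none: no coercion; default arrow::read_* behaviour
tbl_raw <- read_model_out_file(file_path, hub_path, coerce_types = "none")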

annakrystalli (Member, Author) commented on Nov 10, 2023:

Confirm read and column type checking behaviour for different file types

Also, can we just confirm that you are happy with the following column type checks? (These are the same as before, after addressing the issues discussed in #54, but need some recoding due to the changed default behaviour in read_model_out_file.)

  • When reading in CSV files (which do not have column type information embedded), whether a file can be parsed using the hub schema on read is checked via the check_file_read fn. If that check passes, the file is then read in using the hub schema. Therefore, if a CSV is read in, it will always pass the column type check.
  • For arrow and parquet files, files are read in according to the file schema (not coerced to the hub schema). As such, the column type check verifies that the file schema matches the hub schema (and may therefore fail); see the sketch below.
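
A hypothetical wrapper summarising that behaviour (reusing the read_model_out_file interface from the previous comment; not the actual package code):

read_for_type_check <- function(file_path, hub_path) {
  if (tools::file_ext(file_path) == "csv") {
    # CSVs embed no column types: parse-ability under the hub schema is
    # verified first (check_file_read), then the file is read with that
    # schema, so a readable CSV always passes the column type check.
    read_model_out_file(file_path, hub_path, coerce_types = "hub")
  } else {
    # parquet/arrow files are read with their own embedded schema; the
    # column type check then compares it against the hub schema and may fail.
    read_model_out_file(file_path, hub_path, coerce_types = "none")
  }
}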

Could just give me a thumbs up if you agree?

}
tbl
}
12 changes: 10 additions & 2 deletions R/check_tbl_values_required.R
@@ -7,7 +7,8 @@
check_tbl_values_required <- function(tbl, round_id, file_path, hub_path) {
config_tasks <- hubUtils::read_config(hub_path, "tasks")
tbl[["value"]] <- NULL
tbl <- hubUtils::coerce_to_character(tbl)
tbl <- hubUtils::coerce_to_character(tbl) %>%
coerce_num_output_type_ids(file_path, hub_path)

req <- hubUtils::expand_model_out_val_grid(
config_tasks,
@@ -32,7 +33,7 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path) {
)

missing_df <- purrr::pmap(
list(tbl, req, full),
combine_mt_inputs(tbl, req, full),
check_modeling_task_values_required
) %>%
purrr::list_rbind()
@@ -323,3 +324,10 @@ split_na_req <- function(req) {
req[na_idx[, "row"], ] %>%
split(na_idx[, "col"])
}

combine_mt_inputs <- function(tbl, req, full) {
keep_mt <- purrr::map_lgl(req, ~nrow(.x) > 0L)
list(tbl[keep_mt],
req[keep_mt],
full[keep_mt])
}
39 changes: 39 additions & 0 deletions R/opt_check_metadata_team_max_model_n.R
@@ -0,0 +1,39 @@
#' Check that submitting team does not exceed maximum number of allowed models
#' per team
#'
#' @inherit check_metadata_file_exists params
#' @param n_max Integer. Number of maximum allowed models per team.
#' @inherit check_tbl_col_types return
#' @details
#' Should be deployed as part of `validate_model_metadata` optional checks.
#'
#'
#' @export
opt_check_metadata_team_max_model_n <- function(file_path, hub_path, n_max = 2L) {

team_abbr <- parse_file_name(
file_path,
file_type = "model_metadata")$team_abbr
all_model_meta <- hubUtils::load_model_metadata(hub_path)

team_models <- all_model_meta[["model_abbr"]][all_model_meta[["team_abbr"]] == team_abbr]
n_models <- length(team_models)
check <- isFALSE(n_models > n_max)
if (check) {
details <- NULL
} else {
details <- cli::format_inline(
"Team {.val {team_abbr}} has submitted valid metadata for
{.val {n_models}} model{?s}:
{.val {team_models}}.")
}

capture_check_cnd(
check = check,
file_path = file_path,
msg_subject = cli::format_inline(
"Maximum number of models per team ({.val {n_max}})"),
msg_attribute = "exceeded.",
msg_verbs = c("not", ""),
details = details)
}
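
A hypothetical invocation of the new check (illustrative file name and paths; per the details above, it is intended to be deployed among validate_model_metadata's optional checks):

# Flag a team that has submitted metadata for more than two models
opt_check_metadata_team_max_model_n(
  file_path = "hub-baseline.yml",  # model metadata file: <team_abbr>-<model_abbr>.yml
  hub_path = ".",
  n_max = 2L
)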
2 changes: 2 additions & 0 deletions R/opt_check_tbl_col_timediff.R
@@ -3,6 +3,8 @@
#' @param t0_colname Character string. The name of the time zero date column.
#' @param t1_colname Character string. The name of the time zero + 1 time step date column.
#' @param timediff an object of class `lubridate` [`Period-class`] and length 1.
#' @details
#' Should be deployed as part of `validate_model_data` optional checks.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @export
2 changes: 2 additions & 0 deletions R/opt_check_tbl_counts_lt_popn.R
@@ -16,6 +16,8 @@
#'
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @details
#' Should be deployed as part of `validate_model_data` optional checks.
#' @export
#' @examples
#' hub_path <- system.file("testhubs/flusight", package = "hubValidations")
2 changes: 2 additions & 0 deletions R/opt_check_tbl_horizon_timediff.R
@@ -8,6 +8,8 @@
#' The period of a single horizon. Default to 1 week.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @details
#' Should be deployed as part of `validate_model_data` optional checks.
#' @export
opt_check_tbl_horizon_timediff <- function(tbl, file_path, hub_path, t0_colname,
t1_colname, horizon_colname = "horizon",
59 changes: 35 additions & 24 deletions R/parse_file_name.R
@@ -2,9 +2,12 @@
#'
#' @param file_path Character string. A model output file name.
#' Can include parent directories which are ignored.
#' @param file_type Character string. Type of file name being parsed. One of `"model_output"`
#' or `"model_metadata"`.
#'
#' @return A list with the following elements:
#' - `round_id`: The round ID the model output is associated with.
#' - `round_id`: The round ID the model output is associated with (`NA` for
#' model metadata files.)
#' - `team_abbr`: The team responsible for the model.
#' - `model_abbr`: The name of the model.
#' - `model_id`: The unique model ID derived from the concatenation of
@@ -15,32 +18,40 @@
#'
#' @examples
#' parse_file_name("hub-baseline/2022-10-15-hub-baseline.csv")
parse_file_name <- function(file_path) {
checkmate::assert_string(file_path)
file_name <- tools::file_path_sans_ext(basename(file_path))
parse_file_name <- function(file_path, file_type = c("model_output", "model_metadata")) {
file_type <- rlang::arg_match(file_type)
checkmate::assert_string(file_path)
file_name <- tools::file_path_sans_ext(basename(file_path))

split_pattern <- stringr::regex(
"([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+",
TRUE
split_pattern <- stringr::regex(
"([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+",
TRUE
)
split_res <- unlist(
stringr::str_extract_all(
file_name,
split_pattern
)
split_res <- unlist(
stringr::str_extract_all(
file_name,
split_pattern
)
)
if (length(split_res) != 3L) {
cli::cli_abort(
"Could not parse file name {.path {file_name}} for submission metadata.
)
exp_n <- switch(file_type,
model_output = 3L,
model_metadata = 2L
)
if (length(split_res) != exp_n) {
cli::cli_abort(
"Could not parse file name {.path {file_name}} for submission metadata.
Please consult documentation for file name requirements for correct
metadata parsing."
)
}
list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path)
)
}
if (file_type == "model_metadata") {
split_res <- c(NA, split_res)
}
list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path)
)
}
12 changes: 3 additions & 9 deletions R/try_check.R
@@ -10,15 +10,9 @@
try_check <- function(expr, file_path) {
check <- try(expr, silent = TRUE)
if (inherits(check, "try-error")) {
message <- attr(check, "condition")$message
parent_msg <- attr(check, "condition")$parent$message
if (is.character(parent_msg)) {
parent_msg <- paste(parent_msg, collapse = " --> ")
msg <- paste(message, parent_msg, sep = " --> ")
} else {
msg <- message
}
msg <- clean_msg(msg)
msg <- as.character(check) %>%
cli::ansi_strip() %>%
clean_msg()

return(
capture_exec_error(