Improve handling of numeric output type IDs. #60

Merged: 20 commits, Nov 27, 2023

Changes from 11 commits

Commits
aee5bac
add model-metadata-schema to pass updated hubUtils validation checks
annakrystalli Nov 1, 2023
e8da5b4
Improve handling of numeric output type IDs. Resolves #58 & #54
annakrystalli Nov 1, 2023
ea92169
Merge branch 'main' into num-type-ids
annakrystalli Nov 1, 2023
8e0da25
fix error caused by model task with no required task id/output type c…
annakrystalli Nov 2, 2023
ae8c0fc
improve exec error message capturing
annakrystalli Nov 2, 2023
b9374a2
update news
annakrystalli Nov 2, 2023
ae89852
add non req model task tests
annakrystalli Nov 2, 2023
7da8f3a
bump hubUtils requirement
annakrystalli Nov 2, 2023
730040b
Add deployment note in docs
annakrystalli Nov 2, 2023
ec8e614
add ability to parse model metadata file names for metadata
annakrystalli Nov 2, 2023
177e8a5
Add fn opt_check_metadata_team_max_model_n. Resolves #34
annakrystalli Nov 2, 2023
5b53f1b
add test cases (currently failing) that validations reject floating p…
elray1 Nov 2, 2023
7e42400
Update tests/testthat/testdata/hub-chr/hub-config/admin.json
annakrystalli Nov 9, 2023
b116688
Update tests/testthat/testdata/hub-it/hub-config/admin.json
annakrystalli Nov 9, 2023
d756bee
Update tests/testthat/testdata/hub-num/hub-config/admin.json
annakrystalli Nov 9, 2023
0ae59f1
Merge pull request #61 from Infectious-Disease-Modeling-Hubs/test_che…
annakrystalli Nov 9, 2023
e1842e1
skip tests offline
annakrystalli Nov 10, 2023
30910f6
ensure default read behaviour for file type
annakrystalli Nov 10, 2023
27f1e65
require all character tbls
annakrystalli Nov 10, 2023
24a6bf8
read in all character tbl for some checks
annakrystalli Nov 10, 2023
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: hubValidations
Title: Testing framework for hubverse hub validations
Version: 0.0.0.9004
Version: 0.0.0.9005
Authors@R: c(
person(
given = "Anna",
@@ -37,7 +37,7 @@ Imports:
dplyr,
fs,
gh,
hubUtils (>= 0.0.0.9014),
hubUtils (>= 0.0.0.9016),
jsonlite,
jsonvalidate,
lubridate,
1 change: 1 addition & 0 deletions NAMESPACE
@@ -46,6 +46,7 @@ export(is_info)
export(is_success)
export(new_hub_validations)
export(not_pass)
export(opt_check_metadata_team_max_model_n)
export(opt_check_tbl_col_timediff)
export(opt_check_tbl_counts_lt_popn)
export(opt_check_tbl_horizon_timediff)
8 changes: 7 additions & 1 deletion NEWS.md
@@ -1,3 +1,9 @@
# hubValidations 0.0.0.9005

* Improved handling of numeric output type IDs (including high-precision floating points / values with trailing zeros), especially when the overall hub output type ID column is character. This previously led to a number of bugs and false validation failures (#58 & #54), which are addressed in this version.
* Bug fixes with respect to handling modelling tasks with no required task ID / output type combinations.
* Improved capture of error messages when check execution error occurs.

# hubValidations 0.0.0.9004

This release contains a bug fix for reading in and validating CSV column types correctly. (#54)
@@ -10,7 +16,7 @@ This release includes a number of bug fixes:

# hubValidations 0.0.0.9002

This release includes improvements desgined after the first round of sandbox testing on setting up the CDC FluSight hub. Improvements include:
This release includes improvements designed after the first round of sandbox testing on setting up the CDC FluSight hub. Improvements include:

* Export `parse_file_name` function for parsing model output metadata from a model output file name.
* Issue more specific and informative messaging when `check_tbl_values()` check fails.
41 changes: 40 additions & 1 deletion R/check_tbl_values.R
@@ -11,7 +11,9 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) {
# working with larger files but currently arrow does not match NAs as dplyr
# does, returning false positives for mean & median rows which contain NA in
# output type ID column.
tbl <- hubUtils::coerce_to_character(tbl)
tbl <- hubUtils::coerce_to_character(tbl) %>%
coerce_num_output_type_ids(file_path, hub_path)

accepted_vals <- hubUtils::expand_model_out_val_grid(
config_tasks = config_tasks,
round_id = round_id,
@@ -105,3 +107,40 @@ summarise_invalid_values <- function(valid_tbl, accepted_vals) {
invalid_combs_idx = invalid_combs_idx
)
}


get_numeric_output_type_ids <- function(file_path, hub_path) {

get_file_round_config(file_path, hub_path)[["model_tasks"]] %>%
purrr::map(~ .x[["output_type"]]) %>%
unlist(recursive = FALSE) %>%
purrr::map(~ purrr::pluck(.x, "output_type_id")) %>%
purrr::map_lgl(~is.numeric(unlist(.x))) %>%
purrr::keep(isTRUE) %>%
names() %>%
unique()
}


coerce_num_output_type_ids <- function(tbl, file_path, hub_path) {

num_output_types <- get_numeric_output_type_ids(
file_path = file_path,
hub_path = hub_path)

if (any(tbl[["output_type"]] %in% num_output_types) &&
inherits(tbl[["output_type_id"]], "character")) {

type_coerce <- tbl[["output_type"]] %in% num_output_types
num_output_type_id <- suppressWarnings(
as.numeric(tbl$output_type_id[type_coerce])
)
# establish only valid coercions to distinguish between the potential for
# two cdf output types in the same round, one numeric and one character.
valid <- !is.na(num_output_type_id)
tbl$output_type_id[type_coerce][valid] <- as.character(
num_output_type_id[valid]
)
Comment on lines +138 to +140

elray1 (Contributor) commented:
I think the use of as.character here would open up the problems I outlined in discussion #59. I added an example of a unit test that passes, but that I think should not pass, here. In that example, the output_type_id in the submission file is 0.09999999999999997779554. On my system, this is not equal to the floating point representation of 0.1, but as.character casts it to "0.1". Here's an illustration:

> a <- 0.0999999999999999777955
> b <- 0.1000000000000000055511
> c <- 0.1
> a == b
[1] FALSE
> a == c
[1] FALSE
> b == c
[1] TRUE
> as.character(a) == as.character(b)
[1] TRUE
> as.character(a) == as.character(c)
[1] TRUE
> as.character(b) == as.character(c)
[1] TRUE
> as.character(a)
[1] "0.1"

As a result of this, if we use values coerced via as.character to do validations, we may be generous in passing submissions that as.character rounds to the value specified in the tasks config file. This would then cause issues later on in places like ensembling code or filtering code, where grouping or filtering on the output_type_id may give unexpected results.

I am wondering if a way forward might be to do the data type coercion in a different order, or just deal with character representations from the start for purposes of validating output_type_ids and task id variables? Something like:

  1. For purposes of validation, read in all fields other than the value column as character strings. Additionally, read the contents of tasks.json as strings if that's possible (may not be?)?
  2. Check exact equality of values for output_type_ids and task id variables with the required/optional values specified in tasks.json, as strings. For example, if the tasks.json file says that a valid quantile level is 0.1, we expect to see the exact characters "0.1" in the submission file. This would change our current passing validation results for submissions with values like 0.1000000000000000055511 to failures.

Something like this would entirely sidestep the issues of what gets rounded to 0.1 or what the floating point representation of 0.1 is.
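
As a rough sketch of step 2 (hypothetical helper and argument names, not actual package code), comparing everything as exact character strings sidesteps floating point representations entirely:

# Minimal sketch: validate output_type_id values as exact strings.
# `submitted` comes straight from the file, read as character;
# `allowed` holds the character representations of the tasks.json values.
validate_type_ids_as_strings <- function(submitted, allowed) {
  invalid <- setdiff(unique(submitted), allowed)
  if (length(invalid) > 0) {
    stop("Invalid output_type_id value(s): ", toString(invalid))
  }
  invisible(TRUE)
}

# "0.1000000000000000055511" != "0.1" as strings, so this would now fail:
# validate_type_ids_as_strings("0.1000000000000000055511", c("0.1", "0.25"))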

Another option might be to use arrow's cast function for type conversion instead of as.character, since it seems to be more careful in what it rounds. But my question about that is whether the outputs here would be stable across platforms. For example, maybe on one computer the floating point representation of 0.1 is 0.1000000000000000055511 and on another computer (e.g., one that's not using 64 bit representations of numbers) it's something slightly different. For that reason, it seems better to avoid relying on equality of floating point representations if possible.

annakrystalli (Member, Author) replied:
Thanks @elray1. Great points, and I agree: although some output type IDs are numeric, given we want to match output type ID values effectively as categories / use them for grouping (both in validation and downstream), it does feel more appropriate to work with the data as character for these purposes and match exact values, not check for equality using numeric tolerances or rounding behaviours.

So I will look into the most efficient way of including all character versions of data in our model output validation checks.

I will merge your currently failing test to include it in the test suite. I also feel the original test you included (of a 0.1000000000000000055511 value supplied instead of a 0.1 value) should now also fail.

Persistent issue with trailing zeros in tasks.json

The one remaining problem, however, that as.character() was solving for us is removing trailing zeros. When reading the tasks.json into R from JSON and then passing values to expand.grid to create the grid of all possible valid values, any trailing zeros are removed from numeric output type IDs. Although I spent quite some time trying to figure out a way to read in values from JSON in R as character (retaining any trailing zeros), I could not find one, so I don't think the suggestion:

Additionally, read the contents of tasks.json as strings if that's possible (may not be?)?
is currently possible.

Using as.character() on model output data as proposed in this PR was addressing this issue:

as.character(c(0.300, 0.350))
#> [1] "0.3"  "0.35"

so that including 0.300 as an output type ID in a tasks.json would match a 0.300 value in the data (which previously failed because the value from config would be converted to 0.3 while the value in model output data would be read in as 0.300). Having said that, the approach in this PR is still a bit of a hack, as, to match values, trailing zeros are removed from both sides, which is not quite right. I think we will have to tackle this with strong documentation and perhaps some sort of validation check (although I think a validation check will be difficult for the same reasons: detecting trailing zeros in numerics will likely be hampered by the requirement to retain trailing zeros... our original problem!).
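
For example, the trailing zeros are already gone by the time the JSON is parsed (an illustration of the general behaviour using jsonlite, which the package imports; not necessarily the exact hubUtils code path):

jsonlite::fromJSON("[0.300, 0.350]")
#> [1] 0.30 0.35   # stored as the plain doubles 0.3 and 0.35
as.character(jsonlite::fromJSON("[0.300, 0.350]"))
#> [1] "0.3"  "0.35"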

annakrystalli (Member, Author) commented:
OK Just to clarify what the new approach I'm putting together is doing:

  • The read_model_out_file fn now has an argument coerce_types with values hub, chr and none instead of the logical use_hub_schema.
    • hub: reads in (CSVs) or coerces (parquet & arrow, which have a schema defined in the file, using arrow's cast) to the hub schema.
    • chr: reads in (CSVs) or coerces (parquet & arrow) all columns as character.
    • none: no coercion. Read using default arrow::read_* function settings.

This now causes both 0.1000000000000000055511 and 0.09999999999999997779554 to fail validation. It also means that values with trailing zeros (e.g. 0.300) will always fail, regardless of whether 0.300 is specified in tasks.json (leading to the issue discussed in #54). While I do feel this is actually correct behaviour when validating model output data, it still feels unsatisfactory to me from the standpoint that values with trailing zeros can currently be specified in config files but will not be respected when reading in and will cause validation errors. I think this will remain problematic until we find a way to detect and throw a warning about trailing zeros during config file validation.
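
For concreteness, a hypothetical call pattern for the three modes (the exact signature of read_model_out_file may differ from this sketch; the coerce_types values are as described above):

# hub: read (CSV) or cast (parquet/arrow) to the hub schema
tbl_hub <- read_model_out_file(file_path, hub_path, coerce_types = "hub")
# chr: all columns as character, enabling exact string matching of values
tbl_chr <- read_model_out_file(file_path, hub_path, coerce_types = "chr")
# none: no coercion; default arrow::read_* behaviour
tbl_raw <- read_model_out_file(file_path, hub_path, coerce_types = "none")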

annakrystalli (Member, Author) commented on Nov 10, 2023:

Confirm read and column type checking behaviour for different file types

Also, can we just confirm that you are happy with the following column type checks? (These are the same as before, after addressing the issues discussed in #54, but need some recoding due to the changed default behaviour in read_model_out_file.)

  • When reading in CSV files (which do not have column type information embedded), whether a file can be parsed using the hub schema on read is checked via the check_file_read fn. If that check passes, the file is then read in using the hub schema. Therefore, if a CSV is read in, it will always pass the column type check.
  • For arrow and parquet files, files are read in according to the file schema (not coerced to the hub schema). As such, the column type check verifies that the file schema matches the hub schema (and may therefore fail); see the sketch below.
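
A hypothetical wrapper summarising that behaviour (reusing the read_model_out_file interface from the previous comment; not the actual package code):

read_for_type_check <- function(file_path, hub_path) {
  if (tools::file_ext(file_path) == "csv") {
    # CSVs embed no column types: parse-ability under the hub schema is
    # verified first (check_file_read), then the file is read with that
    # schema, so a readable CSV always passes the column type check.
    read_model_out_file(file_path, hub_path, coerce_types = "hub")
  } else {
    # parquet/arrow files are read with their own embedded schema; the
    # column type check then compares it against the hub schema and may fail.
    read_model_out_file(file_path, hub_path, coerce_types = "none")
  }
}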

Could just give me a thumbs up if you agree?

}
tbl
}
12 changes: 10 additions & 2 deletions R/check_tbl_values_required.R
@@ -7,7 +7,8 @@
check_tbl_values_required <- function(tbl, round_id, file_path, hub_path) {
config_tasks <- hubUtils::read_config(hub_path, "tasks")
tbl[["value"]] <- NULL
tbl <- hubUtils::coerce_to_character(tbl)
tbl <- hubUtils::coerce_to_character(tbl) %>%
coerce_num_output_type_ids(file_path, hub_path)

req <- hubUtils::expand_model_out_val_grid(
config_tasks,
@@ -32,7 +33,7 @@ check_tbl_values_required <- function(tbl, round_id, file_path, hub_path) {
)

missing_df <- purrr::pmap(
list(tbl, req, full),
combine_mt_inputs(tbl, req, full),
check_modeling_task_values_required
) %>%
purrr::list_rbind()
@@ -323,3 +324,10 @@ split_na_req <- function(req) {
req[na_idx[, "row"], ] %>%
split(na_idx[, "col"])
}

combine_mt_inputs <- function(tbl, req, full) {
keep_mt <- purrr::map_lgl(req, ~nrow(.x) > 0L)
list(tbl[keep_mt],
req[keep_mt],
full[keep_mt])
}
39 changes: 39 additions & 0 deletions R/opt_check_metadata_team_max_model_n.R
@@ -0,0 +1,39 @@
#' Check that submitting team does not exceed maximum number of allowed models
#' per team
#'
#' @inherit check_metadata_file_exists params
#' @param n_max Integer. Number of maximum allowed models per team.
#' @inherit check_tbl_col_types return
#' @details
#' Should be deployed as part of `validate_model_metadata` optional checks.
#'
#'
#' @export
opt_check_metadata_team_max_model_n <- function(file_path, hub_path, n_max = 2L) {

team_abbr <- parse_file_name(
file_path,
file_type = "model_metadata")$team_abbr
all_model_meta <- hubUtils::load_model_metadata(hub_path)

team_models <- all_model_meta[["model_abbr"]][all_model_meta[["team_abbr"]] == team_abbr]
n_models <- length(team_models)
check <- isFALSE(n_models > n_max)
if (check) {
details <- NULL
} else {
details <- cli::format_inline(
"Team {.val {team_abbr}} has submitted valid metadata for
{.val {n_models}} model{?s}:
{.val {team_models}}.")
}

capture_check_cnd(
check = check,
file_path = file_path,
msg_subject = cli::format_inline(
"Maximum number of models per team ({.val {n_max}})"),
msg_attribute = "exceeded.",
msg_verbs = c("not", ""),
details = details)
}
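
A hypothetical invocation of the new check (illustrative file name and paths; per the details above, it is intended to be deployed among validate_model_metadata's optional checks):

# Flag a team that has submitted metadata for more than two models
opt_check_metadata_team_max_model_n(
  file_path = "hub-baseline.yml",  # model metadata file: <team_abbr>-<model_abbr>.yml
  hub_path = ".",
  n_max = 2L
)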
2 changes: 2 additions & 0 deletions R/opt_check_tbl_col_timediff.R
@@ -3,6 +3,8 @@
#' @param t0_colname Character string. The name of the time zero date column.
#' @param t1_colname Character string. The name of the time zero + 1 time step date column.
#' @param timediff an object of class `lubridate` [`Period-class`] and length 1.
#' @details
#' Should be deployed as part of `validate_model_data` optional checks.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @export
2 changes: 2 additions & 0 deletions R/opt_check_tbl_counts_lt_popn.R
@@ -16,6 +16,8 @@
#'
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @details
#' Should be deployed as part of `validate_model_data` optional checks.
#' @export
#' @examples
#' hub_path <- system.file("testhubs/flusight", package = "hubValidations")
2 changes: 2 additions & 0 deletions R/opt_check_tbl_horizon_timediff.R
@@ -8,6 +8,8 @@
#' The period of a single horizon. Default to 1 week.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @details
#' Should be deployed as part of `validate_model_data` optional checks.
#' @export
opt_check_tbl_horizon_timediff <- function(tbl, file_path, hub_path, t0_colname,
t1_colname, horizon_colname = "horizon",
59 changes: 35 additions & 24 deletions R/parse_file_name.R
@@ -2,9 +2,12 @@
#'
#' @param file_path Character string. A model output file name.
#' Can include parent directories which are ignored.
#' @param file_type Character string. Type of file name being parsed. One of `"model_output"`
#' or `"model_metadata"`.
#'
#' @return A list with the following elements:
#' - `round_id`: The round ID the model output is associated with.
#' - `round_id`: The round ID the model output is associated with (`NA` for
#' model metadata files.)
#' - `team_abbr`: The team responsible for the model.
#' - `model_abbr`: The name of the model.
#' - `model_id`: The unique model ID derived from the concatenation of
@@ -15,32 +18,40 @@
#'
#' @examples
#' parse_file_name("hub-baseline/2022-10-15-hub-baseline.csv")
parse_file_name <- function(file_path) {
checkmate::assert_string(file_path)
file_name <- tools::file_path_sans_ext(basename(file_path))
parse_file_name <- function(file_path, file_type = c("model_output", "model_metadata")) {
file_type <- rlang::arg_match(file_type)
checkmate::assert_string(file_path)
file_name <- tools::file_path_sans_ext(basename(file_path))

split_pattern <- stringr::regex(
"([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+",
TRUE
split_pattern <- stringr::regex(
"([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+",
TRUE
)
split_res <- unlist(
stringr::str_extract_all(
file_name,
split_pattern
)
split_res <- unlist(
stringr::str_extract_all(
file_name,
split_pattern
)
)
if (length(split_res) != 3L) {
cli::cli_abort(
"Could not parse file name {.path {file_name}} for submission metadata.
)
exp_n <- switch(file_type,
model_output = 3L,
model_metadata = 2L
)
if (length(split_res) != exp_n) {
cli::cli_abort(
"Could not parse file name {.path {file_name}} for submission metadata.
Please consult documentation for file name requirements for correct
metadata parsing."
)
}
list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path)
)
}
if (file_type == "model_metadata") {
split_res <- c(NA, split_res)
}
list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path)
)
}
12 changes: 3 additions & 9 deletions R/try_check.R
@@ -10,15 +10,9 @@
try_check <- function(expr, file_path) {
check <- try(expr, silent = TRUE)
if (inherits(check, "try-error")) {
message <- attr(check, "condition")$message
parent_msg <- attr(check, "condition")$parent$message
if (is.character(parent_msg)) {
parent_msg <- paste(parent_msg, collapse = " --> ")
msg <- paste(message, parent_msg, sep = " --> ")
} else {
msg <- message
}
msg <- clean_msg(msg)
msg <- as.character(check) %>%
cli::ansi_strip() %>%
clean_msg()

return(
capture_exec_error(