Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

91/override out type id type #101

Merged
merged 14 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: hubValidations
Title: Testing framework for hubverse hub validations
Version: 0.2.0
Version: 0.3.0
Authors@R: c(
person(
given = "Anna",
Expand Down Expand Up @@ -38,7 +38,7 @@ Imports:
fs,
gh,
hubAdmin (>= 1.0.0),
hubData (>= 1.0.0),
hubData (>= 1.1.0),
hubUtils (>= 0.1.0),
jsonlite,
jsonvalidate,
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# hubValidations 0.3.0

* Introduce an `output_type_id_datatype` argument to `validate_pr()`, `validate_submission()`, `validate_model_data()` and `expand_model_out_grid()` and set its default value to `"from_config"`. This default means the data type specified in the `output_type_id_datatype` property in `tasks.json` (introduced in schema version `v3.0.1`) is used to cast the hub level `output_type_id` column data type. If the property is not set in the config, the functions fall back to `"auto"`, which detects the simplest data type that can represent all output type id values across all output types and rounds. The argument also allows hub administrators to override this setting manually during validation.
annakrystalli marked this conversation as resolved.
Show resolved Hide resolved

# hubValidations 0.2.0

* Move and rename the following `hubData` functions to `hubValidations`:
Expand Down
15 changes: 12 additions & 3 deletions R/check_tbl_col_types.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,27 @@
#' Check that model output data column datatypes conform to those defined in the
#' hub config.
#' @inherit check_tbl_colnames params
#' @inheritParams hubData::create_hub_schema
#' @return
#' Depending on whether validation has succeeded, one of:
#' - `<message/check_success>` condition class object.
#' - `<warning/check_failure>` condition class object.
#'
#' Returned object also inherits from subclass `<hub_check>`.
#' @export
check_tbl_col_types <- function(tbl, file_path, hub_path) {
check_tbl_col_types <- function(tbl, file_path, hub_path,
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
)) {
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
config_tasks <- hubUtils::read_config(hub_path, "tasks")

schema <- hubData::create_hub_schema(config_tasks,
partitions = NULL,
r_schema = TRUE
r_schema = TRUE,
output_type_id_datatype = output_type_id_datatype
)[names(tbl)]

tbl_types <- purrr::map_chr(tbl, ~ if (inherits(.x, "numeric")) {
Expand All @@ -31,12 +39,13 @@ check_tbl_col_types <- function(tbl, file_path, hub_path) {
details <- NULL
} else {
invalid_cols <- names(compare_types)[!compare_types]
details <- paste(
details <- paste0(
"{.var ", invalid_cols,
"} should be {.val ", schema[invalid_cols],
"} not {.val ", tbl_types[invalid_cols], "}"
) %>%
paste(collapse = ", ") %>%
paste0(".") %>%
cli::format_inline()
}

Expand Down
19 changes: 15 additions & 4 deletions R/expand_model_out_grid.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,16 @@ expand_model_out_grid <- function(config_tasks,
round_id,
required_vals_only = FALSE,
all_character = FALSE,
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
),
as_arrow_table = FALSE,
bind_model_tasks = TRUE,
include_sample_ids = FALSE) {
round_idx <- hubUtils::get_round_idx(config_tasks, round_id)
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)

round_config <- purrr::pluck(
config_tasks,
Expand Down Expand Up @@ -156,7 +162,8 @@ expand_model_out_grid <- function(config_tasks,
config_tasks,
all_character = all_character,
as_arrow_table = as_arrow_table,
bind_model_tasks = bind_model_tasks
bind_model_tasks = bind_model_tasks,
output_type_id_datatype = output_type_id_datatype
)
}

Expand Down Expand Up @@ -221,7 +228,9 @@ fix_round_id <- function(x, round_id, round_config, round_ids) {
# - binding multiple modeling task grids together.
process_mt_grid_outputs <- function(x, config_tasks, all_character,
as_arrow_table = TRUE,
bind_model_tasks = TRUE) {
bind_model_tasks = TRUE,
output_type_id_datatype = output_type_id_datatype) {

if (bind_model_tasks) {
# To bind multiple modeling task grids together, we need to ensure they contain
# the same columns. Any missing columns are padded with NAs.
Expand All @@ -232,7 +241,8 @@ process_mt_grid_outputs <- function(x, config_tasks, all_character,
schema_cols <- names(
hubData::create_hub_schema(
config_tasks,
partitions = NULL
partitions = NULL,
output_type_id_datatype = output_type_id_datatype
)
)
all_cols <- schema_cols[schema_cols %in% all_cols]
Expand All @@ -252,7 +262,8 @@ process_mt_grid_outputs <- function(x, config_tasks, all_character,
~ hubData::coerce_to_hub_schema(
.x,
config_tasks,
as_arrow_table = as_arrow_table
as_arrow_table = as_arrow_table,
output_type_id_datatype = output_type_id_datatype
)
)
}
Expand Down
12 changes: 10 additions & 2 deletions R/opt_check_tbl_col_timediff.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@
#' @param t0_colname Character string. The name of the time zero date column.
#' @param t1_colname Character string. The name of the time zero + 1 time step date column.
#' @param timediff an object of class `lubridate` [`Period-class`] and length 1.
#' @inheritParams hubData::create_hub_schema
#' @details
#' Should be deployed as part of `validate_model_data` optional checks.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @export
opt_check_tbl_col_timediff <- function(tbl, file_path, hub_path,
t0_colname, t1_colname,
timediff = lubridate::weeks(2)) {
timediff = lubridate::weeks(2),
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
)) {
checkmate::assert_class(timediff, "Period")
checkmate::assert_scalar(timediff)
checkmate::assert_character(t0_colname, len = 1L)
Expand All @@ -19,9 +25,11 @@ opt_check_tbl_col_timediff <- function(tbl, file_path, hub_path,
checkmate::assert_choice(t1_colname, choices = names(tbl))

config_tasks <- hubUtils::read_config(hub_path, "tasks")
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
schema <- hubData::create_hub_schema(config_tasks,
partitions = NULL,
r_schema = TRUE
r_schema = TRUE,
output_type_id_datatype = output_type_id_datatype
)
assert_column_date(t0_colname, schema)
assert_column_date(t1_colname, schema)
Expand Down
12 changes: 10 additions & 2 deletions R/opt_check_tbl_horizon_timediff.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
#' The period of a single horizon. Defaults to 1 week.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @inheritParams hubData::create_hub_schema
#' @details
#' Should be deployed as part of `validate_model_data` optional checks.
#' @export
opt_check_tbl_horizon_timediff <- function(tbl, file_path, hub_path, t0_colname,
t1_colname, horizon_colname = "horizon",
timediff = lubridate::weeks()) {
timediff = lubridate::weeks(),
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
)) {
checkmate::assert_class(timediff, "Period")
checkmate::assert_scalar(timediff)
checkmate::assert_character(t0_colname, len = 1L)
Expand All @@ -25,9 +31,11 @@ opt_check_tbl_horizon_timediff <- function(tbl, file_path, hub_path, t0_colname,
checkmate::assert_choice(horizon_colname, choices = names(tbl))

config_tasks <- hubUtils::read_config(hub_path, "tasks")
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
schema <- hubData::create_hub_schema(config_tasks,
partitions = NULL,
r_schema = TRUE
r_schema = TRUE,
output_type_id_datatype = output_type_id_datatype
)
assert_column_date(t0_colname, schema)
assert_column_date(t1_colname, schema)
Expand Down
33 changes: 26 additions & 7 deletions R/read_model_out_file.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,26 @@
#' Read a model output file
#'
#' @inheritParams check_valid_round_id
#' @inheritParams hubData::create_hub_schema
#' @param coerce_types character. What to coerce column types to on read.
#' - `hub`: read in (`csv`) or coerce (`parquet`, `arrow`) to hub schema.
#' - `hub`: (default) read in (`csv`) or coerce (`parquet`, `arrow`) to hub
#' schema.
#' When coercing data types using the `hub` schema, the `output_type_id_datatype`
#' argument can also be used to set the `output_type_id` column data type manually.
#' - `chr`: read in (`csv`) or coerce (`parquet`, `arrow`) all columns to character.
#' - `none`: No coercion. Use `arrow` `read_*` function defaults.
#' @return a tibble of contents of the model output file.
#' @export
read_model_out_file <- function(file_path, hub_path = ".",
coerce_types = c("hub", "chr", "none")) {
coerce_types = c("hub", "chr", "none"),
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
)) {
coerce_types <- rlang::arg_match(coerce_types)
full_path <- abs_file_path(file_path, hub_path)
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)

if (!fs::file_exists(full_path)) {
rel_path <- rel_file_path(file_path, hub_path) # nolint: object_usage_linter
Expand All @@ -31,7 +41,8 @@ read_model_out_file <- function(file_path, hub_path = ".",
if (coerce_on_read) {
schema <- create_model_out_schema(
hub_path,
col_types = coerce_types
col_types = coerce_types,
output_type_id_datatype = output_type_id_datatype
)
}
arrow::read_csv_arrow(
Expand All @@ -43,7 +54,8 @@ read_model_out_file <- function(file_path, hub_path = ".",
if (coerce_types == "hub") {
arrow::read_parquet(full_path) %>%
hubData::coerce_to_hub_schema(
config_tasks = hubUtils::read_config(hub_path, "tasks")
config_tasks = hubUtils::read_config(hub_path, "tasks"),
output_type_id_datatype = output_type_id_datatype
)
} else if (coerce_types == "chr") {
arrow::read_parquet(full_path) %>%
Expand All @@ -56,7 +68,8 @@ read_model_out_file <- function(file_path, hub_path = ".",
if (coerce_types == "hub") {
arrow::read_feather(full_path) %>%
hubData::coerce_to_hub_schema(
config_tasks = hubUtils::read_config(hub_path, "tasks")
config_tasks = hubUtils::read_config(hub_path, "tasks"),
output_type_id_datatype = output_type_id_datatype
)
} else if (coerce_types == "chr") {
arrow::read_feather(full_path) %>%
Expand All @@ -70,11 +83,17 @@ read_model_out_file <- function(file_path, hub_path = ".",
}

create_model_out_schema <- function(hub_path,
col_types = c("hub", "chr")) {
col_types = c("hub", "chr"),
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
)) {
col_types <- rlang::arg_match(col_types)
schema <- hubData::create_hub_schema(
config_tasks = hubUtils::read_config(hub_path, "tasks"),
partitions = NULL
partitions = NULL,
output_type_id_datatype = output_type_id_datatype
)

switch(col_types,
Expand Down
10 changes: 9 additions & 1 deletion R/validate_model_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#'
#' @inheritParams check_tbl_unique_round_id
#' @inheritParams validate_model_file
#' @inheritParams hubData::create_hub_schema
#' @inherit validate_model_file return
#' @export
#' @details
Expand All @@ -23,11 +24,17 @@
#' file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
#' validate_model_data(hub_path, file_path)
validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
),
validations_cfg_path = NULL) {
checks <- new_hub_validations()

file_meta <- parse_file_name(file_path)
round_id <- file_meta$round_id
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)

# -- File parsing checks ----
checks$file_read <- try_check(
Expand Down Expand Up @@ -113,7 +120,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
check_tbl_col_types(
tbl,
file_path = file_path,
hub_path = hub_path
hub_path = hub_path,
output_type_id_datatype = output_type_id_datatype
), file_path
)

Expand Down
14 changes: 12 additions & 2 deletions R/validate_pr.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,24 @@
#' )
#' }
validate_pr <- function(hub_path = ".", gh_repo, pr_number,
round_id_col = NULL, validations_cfg_path = NULL,
round_id_col = NULL,
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
), validations_cfg_path = NULL,
skip_submit_window_check = FALSE,
file_modification_check = c("error", "warn", "message", "none"),
file_modification_check = c(
"error", "warn",
"message", "none"
),
allow_submit_window_mods = TRUE,
submit_window_ref_date_from = c(
"file",
"file_path"
)) {
file_modification_check <- rlang::arg_match(file_modification_check)
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
model_output_dir <- get_hub_model_output_dir(hub_path) # nolint: object_name_linter
model_metadata_dir <- "model-metadata" # nolint: object_name_linter
validations <- new_hub_validations()
Expand Down Expand Up @@ -167,6 +176,7 @@ validate_pr <- function(hub_path = ".", gh_repo, pr_number,
~ validate_submission(
hub_path,
file_path = .x,
output_type_id_datatype = output_type_id_datatype,
validations_cfg_path = validations_cfg_path,
skip_submit_window_check = skip_submit_window_check,
skip_check_config = TRUE
Expand Down
9 changes: 9 additions & 0 deletions R/validate_submission.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#' of the file.
#'
#' @inherit validate_model_data return params
#' @inheritParams hubData::create_hub_schema
#' @param skip_submit_window_check Logical. Whether to skip the submission window check.
#' @param skip_check_config Logical. Whether to skip the hub config validation
#' check.
Expand Down Expand Up @@ -38,13 +39,20 @@
#' validate_submission(hub_path, file_path)
validate_submission <- function(hub_path, file_path, round_id_col = NULL,
validations_cfg_path = NULL,
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
),
skip_submit_window_check = FALSE,
skip_check_config = FALSE,
submit_window_ref_date_from = c(
"file",
"file_path"
)) {
check_hub_config <- new_hub_validations()
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)

if (!skip_check_config) {
check_hub_config$valid_config <- try_check(
check_config_hub_valid(hub_path),
Expand Down Expand Up @@ -80,6 +88,7 @@ validate_submission <- function(hub_path, file_path, round_id_col = NULL,
hub_path = hub_path,
file_path = file_path,
round_id_col = round_id_col,
output_type_id_datatype = output_type_id_datatype,
validations_cfg_path = validations_cfg_path
)

Expand Down
Loading
Loading