Skip to content

Commit

Permalink
Merge pull request #133 from hubverse-org/ak/fix-filename-parse-error…
Browse files Browse the repository at this point in the history
…/132

undefined
  • Loading branch information
annakrystalli authored Oct 24, 2024
2 parents 79d32a7 + a4a17d3 commit 25416ba
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 39 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Added:
- new vignette on how to create custom validation checks for hub validations (#121)
- new section on how to manage additional dependencies required by custom validation functions (#22).
* Bolstered parsing of file names in `parse_file_name()` to ensure they match the specific pattern required rather than just checking patterns of interest are extracted successfully and added more fine-grained check error messages (#132).

# hubValidations 0.7.0

Expand Down
127 changes: 99 additions & 28 deletions R/parse_file_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,51 +29,122 @@ parse_file_name <- function(file_path, file_type = c("model_output", "model_meta
file_type <- rlang::arg_match(file_type)
checkmate::assert_string(file_path)
file_name <- tools::file_path_sans_ext(basename(file_path))
# Detect, validate and remove compression extension before validating and splitting
# file name
compression_ext <- validate_compression_ext(
compression_ext = fs::path_ext(file_name)
)
file_name <- tools::file_path_sans_ext(file_name)
validate_filename_contents(file_name)
validate_filename_pattern(file_name, file_type)

split_res <- split_filename(file_name, file_type)

list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path),
compression_ext = compression_ext
) %>% purrr::compact()
}

# Split file name into round_id, team_abbr and model_abbr
split_filename <- function(file_name, file_type) {
split_pattern <- stringr::regex(
"([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+",
TRUE
)
compress_regex <- paste0(
"\\.(",
paste0(compress_codec, collapse = "|"),
")$"
)
split_res <- unlist(
stringr::str_extract_all(
# remove compression extension before splitting
stringr::str_remove(file_name, compress_regex),
split_pattern
)
)
split_res <- stringr::str_extract_all(file_name, split_pattern) |> unlist()

exp_n <- switch(file_type,
model_output = 3L,
model_metadata = 2L
)

if (length(split_res) != exp_n) {
cli::cli_abort(
"Could not parse file name {.path {file_name}} for submission metadata.
Please consult documentation for file name requirements for correct
metadata parsing."
Please consult
{.href [documentation on file name requirements
](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)} for correct metadata parsing."
)
}
if (file_type == "model_metadata") {
split_res <- c(NA, split_res)
}
# extract compression extension if present
compression_ext <- stringr::str_extract(file_name, compress_regex)
if (is.na(compression_ext)) {
compression_ext <- NULL
} else {
compression_ext <- stringr::str_remove(compression_ext, "^\\.")
split_res
}

# Function that validates a hubverse file name does not contain invalid characters
validate_filename_contents <- function(file_name, call = rlang::caller_env()) {
pattern <- "^[A-Za-z0-9_-]+$"
invalid_contents <- isFALSE(grepl(pattern, file_name))

if (invalid_contents) {

invalid_char <- stringr::str_remove_all(file_name, "[A-Za-z0-9_-]+") |> # nolint: object_usage_linter
strsplit("") |>
unlist() |>
unique()

cli::cli_abort(
c("x" = "File name {.file {file_name}} contains character{?s}
{.val {invalid_char}} that {?is/are} not allowed"),
call = call
)
}
}

# Function that validates a hubverse file name matches expected pattern:
# [round_id]-[team_abbr]-[model_abbr]
validate_filename_pattern <- function(file_name, file_type,
call = rlang::caller_env()) {
pattern <- switch(file_type,
model_output = "^((\\d{4}-\\d{2}-\\d{2})|[A-Za-z0-9_]+)-([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$",
model_metadata = "^([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$"
)


expected_pattern <- switch(file_type, # nolint: object_usage_linter
model_output = "[round_id]-[team_abbr]-[model_abbr]",
model_metadata = "[team_abbr]-[model_abbr]"
)


info_url <- switch(file_type, # nolint: object_usage_linter
model_output = "https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure",
model_metadata = "https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure"
)

if (!grepl(pattern, file_name)) {
cli::cli_abort(
c("x" = "File name {.file {file_name}} does not match expected pattern of
{.field {expected_pattern}}. Please consult
{.href [documentation on file name requirements
]({info_url})}
for details."),
call = call
)
}
}

list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path),
compression_ext = compression_ext
) %>% purrr::compact()
# Function that validates a hubverse file name compression extension.
# Returns NULL if empty string or compression_ext if valid.
validate_compression_ext <- function(compression_ext, call = rlang::caller_env()) {
if (compression_ext == "") {
return(NULL)
}
if (!compression_ext %in% compress_codec) {
cli::cli_abort(
c("x" = "Compression extension {.val {compression_ext}} is not valid.
Must be one of {.val {compress_codec}}.
Please consult {.href [documentation on file name requirements
](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)}
for details."),
call = call
)
}
compression_ext
}
22 changes: 19 additions & 3 deletions tests/testthat/_snaps/parse_file_name.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,31 @@
parse_file_name("hubBaseline.yml", file_type = "model_metadata")
Condition
Error in `parse_file_name()`:
! Could not parse file name 'hubBaseline' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x File name 'hubBaseline' does not match expected pattern of [team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure>) for details.

# parse_file_name fails correctly

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team1_goodmodel' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x File name '2022-10-08-team1_goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

---

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv")
Condition
Error in `parse_file_name()`:
x File name '2022-10-08-team1_goodmodel ' contains character " " that is not allowed

---

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv")
Condition
Error in `parse_file_name()`:
x File name '2022-10-08-team1_goodmodel*' contains character "*" that is not allowed

# parse_file_name ignores compression extensions

Expand Down Expand Up @@ -180,5 +196,5 @@
"model-output/team1-goodmodel/2022-10-08-team1-goodmodel.gzipr.parquet")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team1-goodmodel.gzipr' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x Compression extension "gzipr" is not valid. Must be one of "snappy", "gzip", "gz", "brotli", "zstd", "lz4", "lzo", and "bz2". Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

2 changes: 1 addition & 1 deletion tests/testthat/_snaps/utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
get_file_round_id(file_path = "team1-goodmodel/2022-10-08-team-1-goodmodel.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team-1-goodmodel' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x File name '2022-10-08-team-1-goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

# get_file_* utils work

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/_snaps/validate_model_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@
validate_model_data(hub_path, file_path = "random-path.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name 'random-path' for submission metadata. Please consult documentation for file name requirements for correct metadata parsing.
x File name 'random-path' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

# validate_model_data with v3 sample data works

Expand Down
14 changes: 14 additions & 0 deletions tests/testthat/test-parse_file_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ test_that("parse_file_name fails correctly", {
),
error = TRUE
)
# Correctly detects invalid characters
expect_snapshot(
parse_file_name(
"model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv"
),
error = TRUE
)
expect_snapshot(
parse_file_name(
"model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv"
),
error = TRUE
)

expect_error(
parse_file_name(
"model-output/team1-goodmodel/round_1-team1-good-model.parquet"
Expand Down
12 changes: 7 additions & 5 deletions tests/testthat/test-validate_pr.R
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,13 @@ test_that("validate_pr works on valid PR using v2.0.0 schema and old orgname", {
branch = "pr-valid"
)

checks <- validate_pr(
hub_path = temp_hub,
gh_repo = "hubverse-org/ci-testhub-simple-old-orgname",
pr_number = 1,
skip_submit_window_check = TRUE
checks <- suppressMessages(
validate_pr(
hub_path = temp_hub,
gh_repo = "hubverse-org/ci-testhub-simple-old-orgname",
pr_number = 1,
skip_submit_window_check = TRUE
)
)

expect_snapshot(str(checks))
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/testdata/hub-now/hub-config/admin.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"schema_version": "https://raw.githubusercontent.com/Infectious-Disease-Modeling-Hubs/schemas/main/v3.0.1/admin-schema.json",
"schema_version": "https://raw.githubusercontent.com/hubverse-org/schemas/main/v3.0.1/admin-schema.json",
"name": "SARS-CoV-2 Variant Nowcast Hub",
"maintainer": "jbloggs @ UMass-Amherst",
"contact": {
Expand Down

0 comments on commit 25416ba

Please sign in to comment.