Skip to content

Commit

Permalink
Bolstered parsing of file names. Resolves #132
Browse files Browse the repository at this point in the history
  • Loading branch information
annakrystalli committed Oct 24, 2024
1 parent 9873ad3 commit 5ea1a10
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 34 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Added:
- new vignette on how to create custom validation checks for hub validations (#121)
- new section on how to manage additional dependencies required by custom validation functions (#22).
* Bolstered parsing of file names in `parse_file_name()` to ensure they match the specific pattern required rather than just checking patterns of interest are extracted successfully and added more fine-grained check error messages (#132).

# hubValidations 0.7.0

Expand Down
123 changes: 94 additions & 29 deletions R/parse_file_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,55 +28,120 @@
parse_file_name <- function(file_path, file_type = c("model_output", "model_metadata")) {
file_type <- rlang::arg_match(file_type)
checkmate::assert_string(file_path)
file_name <- tools::file_path_sans_ext(basename(file_path))
# Detect, validate and remove compression extension before validating and splitting
# file name
compression_ext <- validate_compression_ext(
compression_ext = fs::path_ext(file_name)
)
file_name <- tools::file_path_sans_ext(file_name)
validate_filename_contents(file_name)
validate_filename_pattern(file_name, file_type)

split_res <- split_filename(file_name, file_type)

list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path),
compression_ext = compression_ext
) %>% purrr::compact()
}

# Split file name into round_id, team_abbr and model_abbr
split_filename <- function(file_name, file_type) {
split_pattern <- stringr::regex(
"([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+",
TRUE
)
compress_regex <- paste0(
"\\.(",
paste0(compress_codec, collapse = "|"),
")$"
)
file_name <- tools::file_path_sans_ext(basename(file_path)) |>
# remove compression extension before validating and splitting
stringr::str_remove(compress_regex)
split_res <- stringr::str_extract_all(file_name, split_pattern) |> unlist()

split_res <- unlist(
stringr::str_extract_all(
file_name,
split_pattern
)
)
exp_n <- switch(file_type,
model_output = 3L,
model_metadata = 2L
)

if (length(split_res) != exp_n) {
cli::cli_abort(
"Could not parse file name {.path {file_name}} for submission metadata.
Please consult
{.href [documentation for file name requirements
{.href [documentation on file name requirements
](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)} for correct metadata parsing."
)
}
if (file_type == "model_metadata") {
split_res <- c(NA, split_res)
}
# extract compression extension if present
compression_ext <- stringr::str_extract(file_name, compress_regex)
if (is.na(compression_ext)) {
compression_ext <- NULL
} else {
compression_ext <- stringr::str_remove(compression_ext, "^\\.")
split_res
}

# Function that validates a hubverse file name does not contain invalid characters
validate_filename_contents <- function(file_name, call = rlang::caller_env()) {
pattern <- "^[A-Za-z0-9_-]+$"
invalid_contents <- isFALSE(grepl(pattern, file_name))

if (invalid_contents) {
invalid_char <- stringr::str_remove_all(file_name, "[A-Za-z0-9_-]+") |>
strsplit("") |>
unlist() |>
unique()

cli::cli_abort(
c("x" = "File name {.file {file_name}} contains character{?s}
{.val {invalid_char}} that {?is/are} not allowed"),
call = call
)
}
}

# Function that validates a hubverse file name matches expected pattern:
# [round_id]-[team_abbr]-[model_abbr]
validate_filename_pattern <- function(file_name, file_type,
call = rlang::caller_env()) {
pattern <- switch(file_type,
model_output = "^((\\d{4}-\\d{2}-\\d{2})|[A-Za-z0-9_]+)-([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$",
model_metadata = "^([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$"
)

expected_pattern <- switch(file_type,
model_output = "[round_id]-[team_abbr]-[model_abbr]",
model_metadata = "[team_abbr]-[model_abbr]"
)

info_url <- switch(file_type,
model_output = "https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure",
model_metadata = "https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure"
)

if (!grepl(pattern, file_name)) {
cli::cli_abort(
c("x" = "File name {.file {file_name}} does not match expected pattern of
{.field {expected_pattern}}. Please consult
{.href [documentation on file name requirements
]({info_url})}
for details."),
call = call
)
}
}

list(
round_id = split_res[1],
team_abbr = split_res[2],
model_abbr = split_res[3],
model_id = paste(split_res[2], split_res[3], sep = "-"),
ext = fs::path_ext(file_path),
compression_ext = compression_ext
) %>% purrr::compact()
# Function that validates a hubverse file name compression extension.
# Returns NULL if empty string or compression_ext if valid.
validate_compression_ext <- function(compression_ext, call = rlang::caller_env()) {
if (compression_ext == "") {
return(NULL)
}
if (!compression_ext %in% compress_codec) {
cli::cli_abort(
c("x" = "Compression extension {.val {compression_ext}} is not valid.
Must be one of {.val {compress_codec}}.
Please consult {.href [documentation on file name requirements
](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)}
for details."),
call = call
)
}
compression_ext
}
31 changes: 28 additions & 3 deletions tests/testthat/_snaps/parse_file_name.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,31 @@
parse_file_name("hubBaseline.yml", file_type = "model_metadata")
Condition
Error in `parse_file_name()`:
! Could not parse file name 'hubBaseline' for submission metadata. Please consult documentation for file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for correct metadata parsing.
x File name 'hubBaseline' does not match expected pattern of [team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure>) for details.

# parse_file_name fails correctly

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team1_goodmodel' for submission metadata. Please consult documentation for file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for correct metadata parsing.
x File name '2022-10-08-team1_goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

---

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv")
Condition
Error in `parse_file_name()`:
x File name '2022-10-08-team1_goodmodel ' contains character " " that is not allowed

---

Code
parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv")
Condition
Error in `parse_file_name()`:
x File name '2022-10-08-team1_goodmodel*' contains character "*" that is not allowed

# parse_file_name ignores compression extensions

Expand All @@ -119,6 +135,9 @@
$ext
[1] "parquet"
$compression_ext
[1] "gzip"

---

Expand All @@ -141,6 +160,9 @@
$ext
[1] "parquet"
$compression_ext
[1] "gz"

---

Expand All @@ -163,6 +185,9 @@
$ext
[1] "parquet"
$compression_ext
[1] "snappy"

---

Expand All @@ -171,5 +196,5 @@
"model-output/team1-goodmodel/2022-10-08-team1-goodmodel.gzipr.parquet")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team1-goodmodel.gzipr' for submission metadata. Please consult documentation for file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for correct metadata parsing.
x Compression extension "gzipr" is not valid. Must be one of "snappy", "gzip", "gz", "brotli", "zstd", "lz4", "lzo", and "bz2". Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

2 changes: 1 addition & 1 deletion tests/testthat/_snaps/utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
get_file_round_id(file_path = "team1-goodmodel/2022-10-08-team-1-goodmodel.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name '2022-10-08-team-1-goodmodel' for submission metadata. Please consult documentation for file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for correct metadata parsing.
x File name '2022-10-08-team-1-goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

# get_file_* utils work

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/_snaps/validate_model_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@
validate_model_data(hub_path, file_path = "random-path.csv")
Condition
Error in `parse_file_name()`:
! Could not parse file name 'random-path' for submission metadata. Please consult documentation for file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for correct metadata parsing.
x File name 'random-path' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements (<https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure>) for details.

# validate_model_data with v3 sample data works

Expand Down
14 changes: 14 additions & 0 deletions tests/testthat/test-parse_file_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ test_that("parse_file_name fails correctly", {
),
error = TRUE
)
# Correctly detects invalid characters
expect_snapshot(
parse_file_name(
"model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv"
),
error = TRUE
)
expect_snapshot(
parse_file_name(
"model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv"
),
error = TRUE
)

expect_error(
parse_file_name(
"model-output/team1-goodmodel/round_1-team1-good-model.parquet"
Expand Down

0 comments on commit 5ea1a10

Please sign in to comment.