diff --git a/NEWS.md b/NEWS.md index 414715c9..9d45181a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ * Added: - new vignette on how to create custom validation checks for hub validations (#121) - new section on how to manage additional dependencies required by custom validation functions (#22). +* Bolstered parsing of file names in `parse_file_name()` to ensure they match the specific pattern required rather than just checking patterns of interest are extracted successfully and added more fine-grained check error messages (#132). # hubValidations 0.7.0 diff --git a/R/parse_file_name.R b/R/parse_file_name.R index 9392b593..1e872b4d 100644 --- a/R/parse_file_name.R +++ b/R/parse_file_name.R @@ -28,55 +28,120 @@ parse_file_name <- function(file_path, file_type = c("model_output", "model_metadata")) { file_type <- rlang::arg_match(file_type) checkmate::assert_string(file_path) + file_name <- tools::file_path_sans_ext(basename(file_path)) + # Detect, validate and remove compression extension before validating and splitting + # file name + compression_ext <- validate_compression_ext( + compression_ext = fs::path_ext(file_name) + ) + file_name <- tools::file_path_sans_ext(file_name) + validate_filename_contents(file_name) + validate_filename_pattern(file_name, file_type) + + split_res <- split_filename(file_name, file_type) + + list( + round_id = split_res[1], + team_abbr = split_res[2], + model_abbr = split_res[3], + model_id = paste(split_res[2], split_res[3], sep = "-"), + ext = fs::path_ext(file_path), + compression_ext = compression_ext + ) %>% purrr::compact() +} +# Split file name into round_id, team_abbr and model_abbr +split_filename <- function(file_name, file_type) { split_pattern <- stringr::regex( "([[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2})|[a-z_0-9]+", TRUE ) - compress_regex <- paste0( - "\\.(", - paste0(compress_codec, collapse = "|"), - ")$" - ) - file_name <- tools::file_path_sans_ext(basename(file_path)) |> - # remove compression extension before validating and splitting - stringr::str_remove(compress_regex) + split_res <- stringr::str_extract_all(file_name, split_pattern) |> unlist() - split_res <- unlist( - stringr::str_extract_all( - file_name, - split_pattern - ) - ) exp_n <- switch(file_type, model_output = 3L, model_metadata = 2L ) + if (length(split_res) != exp_n) { cli::cli_abort( "Could not parse file name {.path {file_name}} for submission metadata. Please consult - {.href [documentation for file name requirements + {.href [documentation on file name requirements ](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)} for correct metadata parsing." ) } if (file_type == "model_metadata") { split_res <- c(NA, split_res) } - # extract compression extension if present - compression_ext <- stringr::str_extract(file_name, compress_regex) - if (is.na(compression_ext)) { - compression_ext <- NULL - } else { - compression_ext <- stringr::str_remove(compression_ext, "^\\.") + split_res +} + +# Function that validates a hubverse file name does not contain invalid characters +validate_filename_contents <- function(file_name, call = rlang::caller_env()) { + pattern <- "^[A-Za-z0-9_-]+$" + invalid_contents <- isFALSE(grepl(pattern, file_name)) + + if (invalid_contents) { + invalid_char <- stringr::str_remove_all(file_name, "[A-Za-z0-9_-]+") |> + strsplit("") |> + unlist() |> + unique() + + cli::cli_abort( + c("x" = "File name {.file {file_name}} contains character{?s} + {.val {invalid_char}} that {?is/are} not allowed"), + call = call + ) + } +} + +# Function that validates a hubverse file name matches expected pattern: +# [round_id]-[team_abbr]-[model_abbr] +validate_filename_pattern <- function(file_name, file_type, + call = rlang::caller_env()) { + pattern <- switch(file_type, + model_output = "^((\\d{4}-\\d{2}-\\d{2})|[A-Za-z0-9_]+)-([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$", + model_metadata = "^([A-Za-z0-9_]+)-([A-Za-z0-9_]+)$" + ) + + expected_pattern <- switch(file_type, + model_output = "[round_id]-[team_abbr]-[model_abbr]", + model_metadata = "[team_abbr]-[model_abbr]" + ) + + info_url <- switch(file_type, + model_output = "https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure", + model_metadata = "https://hubverse.io/en/latest/user-guide/model-metadata.html#directory-structure" + ) + + if (!grepl(pattern, file_name)) { + cli::cli_abort( + c("x" = "File name {.file {file_name}} does not match expected pattern of + {.field {expected_pattern}}. Please consult + {.href [documentation on file name requirements + ]({info_url})} + for details."), + call = call + ) } +} - list( - round_id = split_res[1], - team_abbr = split_res[2], - model_abbr = split_res[3], - model_id = paste(split_res[2], split_res[3], sep = "-"), - ext = fs::path_ext(file_path), - compression_ext = compression_ext - ) %>% purrr::compact() +# Function that validates a hubverse file name compression extension. +# Returns NULL if empty string or compression_ext if valid. +validate_compression_ext <- function(compression_ext, call = rlang::caller_env()) { + if (compression_ext == "") { + return(NULL) + } + if (!compression_ext %in% compress_codec) { + cli::cli_abort( + c("x" = "Compression extension {.val {compression_ext}} is not valid. + Must be one of {.val {compress_codec}}. + Please consult {.href [documentation on file name requirements + ](https://hubverse.io/en/latest/user-guide/model-output.html#directory-structure)} + for details."), + call = call + ) + } + compression_ext } diff --git a/tests/testthat/_snaps/parse_file_name.md b/tests/testthat/_snaps/parse_file_name.md index 7fd465d4..90e72cf6 100644 --- a/tests/testthat/_snaps/parse_file_name.md +++ b/tests/testthat/_snaps/parse_file_name.md @@ -88,7 +88,7 @@ parse_file_name("hubBaseline.yml", file_type = "model_metadata") Condition Error in `parse_file_name()`: - ! Could not parse file name 'hubBaseline' for submission metadata. Please consult documentation for file name requirements () for correct metadata parsing. + x File name 'hubBaseline' does not match expected pattern of [team_abbr]-[model_abbr]. Please consult documentation on file name requirements () for details. # parse_file_name fails correctly @@ -96,7 +96,23 @@ parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel.csv") Condition Error in `parse_file_name()`: - ! Could not parse file name '2022-10-08-team1_goodmodel' for submission metadata. Please consult documentation for file name requirements () for correct metadata parsing. + x File name '2022-10-08-team1_goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements () for details. + +--- + + Code + parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv") + Condition + Error in `parse_file_name()`: + x File name '2022-10-08-team1_goodmodel ' contains character " " that is not allowed + +--- + + Code + parse_file_name("model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv") + Condition + Error in `parse_file_name()`: + x File name '2022-10-08-team1_goodmodel*' contains character "*" that is not allowed # parse_file_name ignores compression extensions @@ -119,6 +135,9 @@ $ext [1] "parquet" + $compression_ext + [1] "gzip" + --- @@ -141,6 +160,9 @@ $ext [1] "parquet" + $compression_ext + [1] "gz" + --- @@ -163,6 +185,9 @@ $ext [1] "parquet" + $compression_ext + [1] "snappy" + --- @@ -171,5 +196,5 @@ "model-output/team1-goodmodel/2022-10-08-team1-goodmodel.gzipr.parquet") Condition Error in `parse_file_name()`: - ! Could not parse file name '2022-10-08-team1-goodmodel.gzipr' for submission metadata. Please consult documentation for file name requirements () for correct metadata parsing. + x Compression extension "gzipr" is not valid. Must be one of "snappy", "gzip", "gz", "brotli", "zstd", "lz4", "lzo", and "bz2". Please consult documentation on file name requirements () for details. diff --git a/tests/testthat/_snaps/utils.md b/tests/testthat/_snaps/utils.md index a7bb67b9..723fceba 100644 --- a/tests/testthat/_snaps/utils.md +++ b/tests/testthat/_snaps/utils.md @@ -41,7 +41,7 @@ get_file_round_id(file_path = "team1-goodmodel/2022-10-08-team-1-goodmodel.csv") Condition Error in `parse_file_name()`: - ! Could not parse file name '2022-10-08-team-1-goodmodel' for submission metadata. Please consult documentation for file name requirements () for correct metadata parsing. + x File name '2022-10-08-team-1-goodmodel' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements () for details. # get_file_* utils work diff --git a/tests/testthat/_snaps/validate_model_data.md b/tests/testthat/_snaps/validate_model_data.md index e1b44ebd..c05c1985 100644 --- a/tests/testthat/_snaps/validate_model_data.md +++ b/tests/testthat/_snaps/validate_model_data.md @@ -334,7 +334,7 @@ validate_model_data(hub_path, file_path = "random-path.csv") Condition Error in `parse_file_name()`: - ! Could not parse file name 'random-path' for submission metadata. Please consult documentation for file name requirements () for correct metadata parsing. + x File name 'random-path' does not match expected pattern of [round_id]-[team_abbr]-[model_abbr]. Please consult documentation on file name requirements () for details. # validate_model_data with v3 sample data works diff --git a/tests/testthat/test-parse_file_name.R b/tests/testthat/test-parse_file_name.R index 0f0cfbba..274d3774 100644 --- a/tests/testthat/test-parse_file_name.R +++ b/tests/testthat/test-parse_file_name.R @@ -56,6 +56,20 @@ test_that("parse_file_name fails correctly", { ), error = TRUE ) + # Correctly detects invalid characters + expect_snapshot( + parse_file_name( + "model-output/team1-goodmodel/2022-10-08-team1_goodmodel .csv" + ), + error = TRUE + ) + expect_snapshot( + parse_file_name( + "model-output/team1-goodmodel/2022-10-08-team1_goodmodel*.csv" + ), + error = TRUE + ) + expect_error( parse_file_name( "model-output/team1-goodmodel/round_1-team1-good-model.parquet"