Skip to content

Commit

Permalink
optimise check_tbl_values by output_type batching & ignoring derived …
Browse files Browse the repository at this point in the history
…task-ids. Related to #93
  • Loading branch information
annakrystalli committed Aug 9, 2024
1 parent 21d2f85 commit 2aa70e9
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 30 deletions.
76 changes: 50 additions & 26 deletions R/check_tbl_values.R
Original file line number Diff line number Diff line change
@@ -1,42 +1,33 @@
#' Check model output data tbl contains valid value combinations
#' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**.
#' @inherit check_tbl_colnames params
#' @inheritParams expand_model_out_grid
#' @inherit check_tbl_colnames return
#' @export
check_tbl_values <- function(tbl, round_id, file_path, hub_path) {
check_tbl_values <- function(tbl, round_id, file_path, hub_path,
derived_task_ids = NULL) {
config_tasks <- hubUtils::read_config(hub_path, "tasks")

# Coerce accepted vals to character for easier comparison of
# values. Tried to use arrow tbls for comparisons as more efficient when
# working with larger files but currently arrow does not match NAs as dplyr
# does, returning false positives for mean & median rows which contain NA in
# output type ID column.
accepted_vals <- expand_model_out_grid(
config_tasks = config_tasks,
round_id = round_id,
all_character = TRUE
)

# This approach uses dplyr to identify tbl rows that don't have a complete match
# in accepted_vals.
accepted_vals$valid <- TRUE
if (hubUtils::is_v3_config(config_tasks)) {
out_type_ids <- tbl[["output_type_id"]]
tbl[tbl$output_type == "sample", "output_type_id"] <- NA
if (!is.null(derived_task_ids)) {
tbl[, derived_task_ids] <- NA_character_
}

valid_tbl <- dplyr::left_join(
tbl, accepted_vals,
by = names(tbl)[names(tbl) != "value"]
)
valid_tbl <- tbl %>%
split(f = tbl$output_type) %>%
purrr::imap(
~ check_values_by_output_type(tbl = .x, output_type = .y,
config_tasks = config_tasks,
round_id = round_id,
derived_task_ids = derived_task_ids)
) %>% purrr::list_rbind()

check <- !any(is.na(valid_tbl$valid))

if (check) {
details <- NULL
error_tbl <- NULL
} else {
error_summary <- summarise_invalid_values(valid_tbl, accepted_vals)
error_summary <- summarise_invalid_values(valid_tbl, config_tasks, round_id,
derived_task_ids)
details <- error_summary$msg
if (length(error_summary$invalid_combs_idx) == 0L) {
error_tbl <- NULL
Expand Down Expand Up @@ -66,10 +57,43 @@ check_tbl_values <- function(tbl, round_id, file_path, hub_path) {
)
}

summarise_invalid_values <- function(valid_tbl, accepted_vals) {
check_values_by_output_type <- function(tbl, output_type, config_tasks, round_id,
derived_task_ids = NULL) {

# Coerce accepted vals to character for easier comparison of
# values. Tried to use arrow tbls for comparisons as more efficient when
# working with larger files but currently arrow does not match NAs as dplyr
# does, returning false positives for mean & median rows which contain NA in
# output type ID column.
accepted_vals <- expand_model_out_grid(
config_tasks = config_tasks,
round_id = round_id,
all_character = TRUE,
output_types = output_type,
derived_task_ids = derived_task_ids
)

# This approach uses dplyr to identify tbl rows that don't have a complete match
# in accepted_vals.
accepted_vals$valid <- TRUE
if (hubUtils::is_v3_config(config_tasks)) {
out_type_ids <- tbl[["output_type_id"]]
tbl[tbl$output_type == "sample", "output_type_id"] <- NA
}

valid_tbl <- dplyr::left_join(
tbl, accepted_vals,
by = names(tbl)[names(tbl) != "value"]
)

}

summarise_invalid_values <- function(valid_tbl, config_tasks, round_id,
derived_task_ids) {
cols <- names(valid_tbl)[!names(valid_tbl) %in% c("value", "valid")]
uniq_tbl <- purrr::map(valid_tbl[cols], unique)
uniq_config <- purrr::map(accepted_vals[cols], unique)
uniq_config <- get_round_config_values(config_tasks, round_id,
derived_task_ids)[cols]

invalid_vals <- purrr::map2(
uniq_tbl, uniq_config,
Expand Down
61 changes: 61 additions & 0 deletions R/config_tasks-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,64 @@ get_round_output_type_names <- function(config_tasks, round_id,
out
}
}

# get all task_ids values
get_round_config_values <- function(config_tasks, round_id,
derived_task_ids = NULL) {
model_tasks <- hubUtils::get_round_model_tasks(config_tasks, round_id)
task_id_names <- setdiff(
hubUtils::get_round_task_id_names(config_tasks, round_id),
derived_task_ids
)
task_id_values <- purrr::map(
purrr::set_names(task_id_names),
\(.x) get_task_id_values(.x, model_tasks)
)
if (!is.null(derived_task_ids)) {
task_id_values <- c(
task_id_values,
purrr::map(
purrr::set_names(derived_task_ids),
~NA_character_
)
)
}

output_type_names <- get_round_output_type_names(config_tasks, round_id)
output_type_id_values <- purrr::map(
output_type_names,
\(.x) get_output_type_id_values(.x, model_tasks)
) %>% purrr::flatten_chr()

output_types <- list(
output_type = output_type_names,
output_type_id = output_type_id_values
)

c(task_id_values, output_types)
}
get_task_id_values <- function(task_id, model_tasks) {
purrr::map(
model_tasks,
~ .x[["task_ids"]][[task_id]]
) %>%
unlist(use.names = FALSE) %>%
unique() %>%
as.character()
}

get_output_type_id_values <- function(output_type, model_tasks) {
out <- purrr::map(
model_tasks,
~ .x[["output_type"]][[output_type]][["output_type_id"]]
) %>%
unlist(use.names = FALSE) %>%
unique() %>%
as.character()

if (length(out) == 0L) {
return(NA_character_)
} else {
return(out)
}
}
7 changes: 5 additions & 2 deletions R/validate_model_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#' @inheritParams check_tbl_unique_round_id
#' @inheritParams validate_model_file
#' @inheritParams hubData::create_hub_schema
#' @inheritParams expand_model_out_grid
#' @inherit validate_model_file return
#' @export
#' @details
Expand All @@ -29,7 +30,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
"double", "integer",
"logical", "Date"
),
validations_cfg_path = NULL) {
validations_cfg_path = NULL,
derived_task_ids = NULL) {
checks <- new_hub_validations()

file_meta <- parse_file_name(file_path)
Expand Down Expand Up @@ -136,7 +138,8 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
tbl_chr,
round_id = round_id,
file_path = file_path,
hub_path = hub_path
hub_path = hub_path,
derived_task_ids = derived_task_ids
), file_path
)
if (is_any_error(checks$valid_vals)) {
Expand Down
6 changes: 5 additions & 1 deletion man/check_tbl_values.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion man/validate_model_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 2aa70e9

Please sign in to comment.