From f5927c4cc72adc9ebc704c6f235d7ac1f95540d1 Mon Sep 17 00:00:00 2001 From: James Azam Date: Wed, 16 Oct 2024 16:25:23 +0100 Subject: [PATCH] Handle messaging around model treatment of missingness (#774) * Add message about NA treatment for option "missing" * Add missing tests for obs_opts * Add NEWS item * Improve messages Co-authored-by: Sebastian Funk * Check if na is explicitly specified by user Co-authored-by: Sebastian Funk * Change less regular to non-daily * Create function to test for complete data * Add tests for test_data_complete * Document test_data_complete * Add function to crosscheck na as missing setting with data * Move message elsewhere and only check if na is specified for later use * Add new global * Apply na cross checking function in main functions * Rename dirty data * Add test for na cross checking function * Add more tests to obs_opts * Remove global variable * Use alternative syntax to avoid linting issues * copy before modifying data.table * Fix test * Update NEWS.md * Make message/warning verbose and test for message/warning * Simplify comment * Use vapply for type safety * Add lifecycle badge * Add lifecycle badge * Improve docs * Return unmodified obs_opts * Move removal of element of obs_opts() into checker function * Use setequal instead of checking lengths * Remove new unnecessary new lines * Remove trailing white space * Add test to check for expected element introduced * Add new argument to pass column names * Add error for column mismatch * Refactor for readability * Perform missingness check on the raw data * Remove test that no longer applies * Add reviewer and PR number Co-authored-by: Sebastian Funk --------- Co-authored-by: Sebastian Funk --- NEWS.md | 2 + R/checks.R | 90 +++++++++++++++++++++++++++ R/estimate_infections.R | 10 +++ R/estimate_secondary.R | 13 +++- R/opts.R | 8 ++- man/check_na_setting_against_data.Rd | 31 ++++++++++ man/test_data_complete.Rd | 21 +++++++ tests/testthat/test-checks.R | 91 +++++++++++++++++++++++++++- tests/testthat/test-obs_opts.R | 31 ++++++++++ 9 files changed, 290 insertions(+), 7 deletions(-) create mode 100644 man/check_na_setting_against_data.Rd create mode 100644 man/test_data_complete.Rd create mode 100644 tests/testthat/test-obs_opts.R diff --git a/NEWS.md b/NEWS.md index 9260f8137..2353c946d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -27,6 +27,8 @@ A release that introduces model improvements to the Gaussian Process models, alo - Switch to broadcasting the day of the week effect. By @seabbs in #746 and reviewed by @jamesmbaazam. - A warning is now thrown if nonparametric PMFs passed to delay options have consecutive tail values that are below a certain low threshold as these lead to loss in speed with little gain in accuracy. By @jamesmbaazam in #752 and reviewed by @seabbs, with a subsequent bug fix in #802. - `dist_fit()` can now accept any number of `samples` without throwing a warning when `samples` < 1000 in #751 by @jamesmbaazam and reviewed by @seabbs and @sbfnk. +- `obs_opts()` now informs users about how NA observations are treated to help them decide on existing alternatives. By @jamesmbaazam in #774 and reviewed by @sbfnk. +- Users are now informed that `NA` observations will be treated as missing instead of zero when using the default `obs_opts()`. Options to treat `NA` as zeros or accumulate them are also provided. By @jamesmbaazam in #774 and reviewed by @sbfnk. ## Package changes diff --git a/R/checks.R b/R/checks.R index a36fb00be..575700173 100644 --- a/R/checks.R +++ b/R/checks.R @@ -179,3 +179,93 @@ check_sparse_pmf_tail <- function(pmf, span = 5, tol = 1e-6) { ) } } + +#' Check if data has either explicit NA values or implicit missing dates. +#' +#' @param data The data to be checked +#' @param cols_to_check A character vector of the columns to check +#' @return `TRUE` if data is complete, else if data has implicit or explicit +#' missingness, `FALSE`. +#' @importFrom cli cli_abort col_blue +#' @keywords internal +test_data_complete <- function(data, cols_to_check) { + data <- setDT(data) # Convert data to data.table + + if (!all(cols_to_check %in% names(data))) { + cli_abort( + c( + "x" = "{.var cols_to_check} must be present in the data.", + "i" = "{.var data} has columns: {col_blue(names(data))} but you + specified {.var cols_to_expect}: {col_blue(cols_to_check)}." + ) + ) + } + # Check for explicit missingness in the specified columns + data_has_explicit_na <- any( + vapply(data[, cols_to_check, with = FALSE], anyNA, logical(1)) + ) + if (data_has_explicit_na) { + return(FALSE) + } + + # Check for implicit missingness by comparing the expected full date sequence + complete_dates <- seq( + min(data$date, na.rm = TRUE), + max(data$date, na.rm = TRUE), + by = "1 day" + ) + data_has_implicit_na <- !all(complete_dates %in% data$date) + if (data_has_implicit_na) { + return(FALSE) + } + + return(TRUE) # Return TRUE if no missing values or gaps in date sequence +} + +#' Cross-check treatment of `NA` in obs_opts() against input data +#' +#' @description `r lifecycle::badge("experimental")` +#' +#' This function checks the input data for implicit and/or explicit missingness +#' and checks if the user specified `na = "missing"` in [obs_opts()]. +#' If the two are TRUE, it throws a message about how the model treats +#' missingness and provides alternatives. It returns an unmodified [obs_opts()]. +#' +#' This function is necessary because the data and observation model +#' do not currently interact internally. It will be deprecated in future +#' versions when the data specification interface is enhanced. +#' +#' @param obs A call to [obs_opts()] +#' @param data The raw data +#' @inheritParams test_data_complete +#' @importFrom cli cli_inform col_red +#' +#' @return [obs_opts()] +#' @keywords internal +check_na_setting_against_data <- function(data, cols_to_check, obs) { + # If users are using the default treatment of NA's and their data has + # implicit or explicit NA's, inform them of what's happening and alternatives + data_is_complete <- test_data_complete(data, cols_to_check) + if (!obs$accumulate && + obs$na_as_missing_default_used && + !data_is_complete) { + #nolint start: duplicate_argument_linter + cli_inform( + c( + "i" = "{col_red(\"As of version 1.5.0 missing dates or dates with `NA` + cases are treated as missing. This is in contrast to previous versions + where these were interpreted as dates with zero cases. \")}", + "i" = "In order to treat missing or `NA` cases as zeroes, see + solutions in {.url https://github.com/epiforecasts/EpiNow2/issues/767#issuecomment-2348805272}", #nolint + "i" = "If the data is reported at non-daily intervals (for example + weekly), consider using `obs_opts(na=\"accumulate\")`.", + "i" = "For more information on these options, see `?obs_opts`." + ), + .frequency = "regularly", + .frequency_id = "check_na_setting_against_data" + ) + #nolint end + } + obs$na_as_missing_default_used <- NULL + return(obs) +} diff --git a/R/estimate_infections.R b/R/estimate_infections.R index fde83e449..7af5b2f1a 100644 --- a/R/estimate_infections.R +++ b/R/estimate_infections.R @@ -167,6 +167,16 @@ estimate_infections <- function(data, name = "EpiNow2.epinow.estimate_infections" ) } + + # If the user is using the default treatment of NA's as missing and + # their data has implicit or explicit NA's, inform them of what's + # happening and provide alternatives. + obs <- check_na_setting_against_data( + obs = obs, + data = dirty_reported_cases, + cols_to_check = c("date", "confirm") + ) + # Create clean and complete cases # Order cases reported_cases <- create_clean_reported_cases( data, horizon, diff --git a/R/estimate_secondary.R b/R/estimate_secondary.R index 077825381..7de6b32d5 100644 --- a/R/estimate_secondary.R +++ b/R/estimate_secondary.R @@ -182,15 +182,22 @@ estimate_secondary <- function(data, assert_logical(verbose) reports <- data.table::as.data.table(data) - secondary_reports <- reports[, list(date, confirm = secondary)] + # If the user is using the default treatment of NA's as missing and + # their data has implicit or explicit NA's, inform them of what's + # happening and provide alternatives. + obs <- check_na_setting_against_data( + obs = obs, + data = reports, + cols_to_check = c("date", "primary", "secondary") + ) + secondary_reports_dirty <- reports[, list(date, confirm = secondary)] secondary_reports <- create_clean_reported_cases( - secondary_reports, + secondary_reports_dirty, filter_leading_zeros = filter_leading_zeros, zero_threshold = zero_threshold ) ## fill in missing data (required if fitting to prevalence) complete_secondary <- create_complete_cases(secondary_reports) - ## fill down secondary_reports[, confirm := nafill(confirm, type = "locf")] ## fill any early data up diff --git a/R/opts.R b/R/opts.R index 66d8c56d2..bd08b2b44 100644 --- a/R/opts.R +++ b/R/opts.R @@ -628,6 +628,8 @@ obs_opts <- function(family = c("negbin", "poisson"), na = c("missing", "accumulate"), likelihood = TRUE, return_likelihood = FALSE) { + # NB: This has to be checked first before the na argument is touched anywhere. + na_default_used <- missing(na) na <- arg_match(na) if (na == "accumulate") { #nolint start: duplicate_argument_linter @@ -644,9 +646,8 @@ obs_opts <- function(family = c("negbin", "poisson"), .frequency = "regularly", .frequency_id = "obs_opts" ) - #nolint end } - + #nolint end if (length(phi) == 2 && is.numeric(phi)) { cli_abort( c( @@ -664,7 +665,8 @@ obs_opts <- function(family = c("negbin", "poisson"), scale = scale, accumulate = as.integer(na == "accumulate"), likelihood = likelihood, - return_likelihood = return_likelihood + return_likelihood = return_likelihood, + na_as_missing_default_used = na_default_used ) for (param in c("phi", "scale")) { diff --git a/man/check_na_setting_against_data.Rd b/man/check_na_setting_against_data.Rd new file mode 100644 index 000000000..54631c09c --- /dev/null +++ b/man/check_na_setting_against_data.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checks.R +\name{check_na_setting_against_data} +\alias{check_na_setting_against_data} +\title{Cross-check treatment of \code{NA} in obs_opts() against input data} +\usage{ +check_na_setting_against_data(data, cols_to_check, obs) +} +\arguments{ +\item{data}{The raw data} + +\item{cols_to_check}{A character vector of the columns to check} + +\item{obs}{A call to \code{\link[=obs_opts]{obs_opts()}}} +} +\value{ +\code{\link[=obs_opts]{obs_opts()}} +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +This function checks the input data for implicit and/or explicit missingness +and checks if the user specified \code{na = "missing"} in \code{\link[=obs_opts]{obs_opts()}}. +If the two are TRUE, it throws a message about how the model treats +missingness and provides alternatives. It returns an unmodified \code{\link[=obs_opts]{obs_opts()}}. + +This function is necessary because the data and observation model +do not currently interact internally. It will be deprecated in future +versions when the data specification interface is enhanced. +} +\keyword{internal} diff --git a/man/test_data_complete.Rd b/man/test_data_complete.Rd new file mode 100644 index 000000000..ceef12f30 --- /dev/null +++ b/man/test_data_complete.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checks.R +\name{test_data_complete} +\alias{test_data_complete} +\title{Check if data has either explicit NA values or implicit missing dates.} +\usage{ +test_data_complete(data, cols_to_check) +} +\arguments{ +\item{data}{The data to be checked} + +\item{cols_to_check}{A character vector of the columns to check} +} +\value{ +\code{TRUE} if data is complete, else if data has implicit or explicit +missingness, \code{FALSE}. +} +\description{ +Check if data has either explicit NA values or implicit missing dates. +} +\keyword{internal} diff --git a/tests/testthat/test-checks.R b/tests/testthat/test-checks.R index 9a3fc6525..14d207cdb 100644 --- a/tests/testthat/test-checks.R +++ b/tests/testthat/test-checks.R @@ -143,6 +143,95 @@ test_that("check_reports_valid errors for bad 'secondary' specifications", { }) test_that("check_sparse_pmf_tail throws a warning as expected", { + # NB: The warning is set to be thrown once every 8 hours, so hard to test + # regularly. The fix is to change the local setting here to throw the + # warning on demand for the sake of multiple runs of the test within + # 8 hours. That's what the rlang call below does + rlang::local_options(rlib_warning_verbosity = "verbose") pmf <- c(0.4, 0.30, 0.20, 0.05, 0.049995, 4.5e-06, rep(1e-7, 5)) - expect_warning(check_sparse_pmf_tail(pmf), "PMF tail has") + expect_warning( + check_sparse_pmf_tail(pmf), + "PMF tail has" + ) +}) + +test_that("test_data_complete detects complete and incomplete data", { + # example_confirmed with explicit missing dates + ec_missing_date <- copy(example_confirmed)[c(1, 3), date := NA] + # example_confirmed with explicit missing confirm + ec_missing_confirm <- copy(example_confirmed)[c(1, 3), confirm := NA] + # example_confirmed with implicit missing (missing entries) + ec_implicit_missing <- copy(example_confirmed)[-c(1,3,5), ] + # Create a hypothetical complete example_secondary + es <- copy(example_confirmed)[ + , primary := confirm + ][ + , secondary := primary * 0.4 + ] + # example_secondary with explicit missing primary + es_missing_primary <- copy(es)[c(1, 3), primary := NA] + # example_secondary with explicit missing secondary + es_missing_secondary <- copy(es)[c(1, 3), secondary := NA] + + # cols to check + ep_cols <- c("date", "confirm") + es_cols <- c("date", "primary", "secondary") + # Expectations + expect_true(test_data_complete(example_confirmed, ep_cols)) + expect_true(test_data_complete(es, es_cols)) + expect_false(test_data_complete(ec_missing_date, ep_cols)) + expect_false(test_data_complete(ec_missing_confirm, ep_cols)) + expect_false(test_data_complete(es_missing_primary, es_cols)) + expect_false(test_data_complete(es_missing_secondary, es_cols)) + expect_false(test_data_complete(ec_implicit_missing, ep_cols)) +}) + +test_that("check_na_setting_against_data works as expected", { + # If data is incomplete and the default na = "missing" is being used, + # expect a message thrown once every 8 hours. + # NB: We change the local setting here to throw the message on demand, rather + # than every 8 hours, for the sake of multiple runs of the test within + # 8 hours. + rlang::local_options(rlib_message_verbosity = "verbose") + expect_message( + check_na_setting_against_data( + obs = obs_opts(), + data = copy(example_confirmed)[c(1, 3), confirm := NA], + cols_to_check = c("date", "confirm") + ), + "version 1.5.0 missing dates or dates" + ) + # If data is incomplete but the user explicitly set na = "missing", then + # expect no message + expect_no_message( + check_na_setting_against_data( + obs = obs_opts(na = "missing"), + data = copy(example_confirmed)[c(1, 3), confirm := NA], + cols_to_check = c("date", "confirm") + ) + ) + # If data is complete, expect no message even when using default na as + # missing setting + expect_no_message( + check_na_setting_against_data( + obs = obs_opts(), + data = example_confirmed, + cols_to_check = c("date", "confirm") + ) + ) + expect_identical( + setdiff( + names( + obs_opts() + ), + names( + check_na_setting_against_data( + obs = obs_opts(), + data = example_confirmed, + cols_to_check = c("date", "confirm") + ) + ) + ), + "na_as_missing_default_used" + ) }) diff --git a/tests/testthat/test-obs_opts.R b/tests/testthat/test-obs_opts.R new file mode 100644 index 000000000..a9cb17db0 --- /dev/null +++ b/tests/testthat/test-obs_opts.R @@ -0,0 +1,31 @@ +test_that("obs_opts returns expected default values", { + result <- suppressWarnings(obs_opts()) + + expect_s3_class(result, "obs_opts") + expect_equal(result$family, "negbin") + expect_equal(result$weight, 1) + expect_true(result$week_effect) + expect_equal(result$week_length, 7L) + expect_equal(result$scale, list(mean = 1, sd = 0)) + expect_equal(result$accumulate, 0) + expect_true(result$likelihood) + expect_false(result$return_likelihood) +}) + +test_that("obs_opts returns expected messages", { + # The option na = "accumulate" informs the user of what is + # going to be done once every 8 hours, so hard to test regularly. + # NB: We change the local setting here to throw the message on demand, rather + # than every 8 hours, for the sake of multiple runs of the test within + # 8 hours. + rlang::local_options(rlib_message_verbosity = "verbose") + expect_message( + obs_opts(na = "accumulate"), + "modelled values that correspond to NA values" + ) +}) + +test_that("obs_opts behaves as expected for user specified na treatment", { +# If user explicitly specifies NA as missing, then don't throw message + expect_false(obs_opts(na = "missing")$na_as_missing_default_used) +}) \ No newline at end of file