diff --git a/NAMESPACE b/NAMESPACE index 8b17878e3..0b7f2ef01 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(extract_CrIs) export(extract_inits) export(extract_samples) export(extract_stan_param) +export(fill_missing) export(fix_dist) export(fix_parameters) export(forecast_infections) @@ -137,6 +138,7 @@ importFrom(cli,col_blue) importFrom(cli,col_red) importFrom(data.table,":=") importFrom(data.table,.N) +importFrom(data.table,CJ) importFrom(data.table,as.data.table) importFrom(data.table,copy) importFrom(data.table,data.table) diff --git a/NEWS.md b/NEWS.md index ed92b8f02..009ba40fc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,13 +1,26 @@ # EpiNow2 (development version) -## Documentation +## Model changes -- Brought the docs on `alpha_sd` up to date with the code change from prior PR #853. By @zsusswein in #862 and reviewed by @jamesmbaazam. +- The models now support more complex patterns of aggregating reported cases by accumulating them over multiple time points, as well as mixtures of accumulation and missingness via the new `fill_missing()` function and a logical `accumulate` column that can be supplied with the data. By @sbfnk in #851 and reviewed by @seabbs and @jamesmbaazam.. -## Model changes + ```r + # Deprecated + data |> + estimate_infections(obs_opts(na = "accumulate")) -- A bug was fixed where the initial growth was never estimated (i.e. the prior mean was always zero). By @sbfnk in #853 and reviewed by @seabbs. -- A bug was fixed where an internal function for applying a default cdf cutoff failed due to a difference a vector length issue. By @jamesmbaazam in #858 and reviewed by @sbfnk. + # Recommended workflow e.g. for weekly incidence data + data |> + fill_missing(missing = "accumulate", initial_accumulate = 7) |> + estimate_infections() + ``` + + - A bug was fixed where the initial growth was never estimated (i.e. the prior mean was always zero). By @sbfnk in #853 and reviewed by @seabbs. + - A bug was fixed where an internal function for applying a default cdf cutoff failed due to a difference a vector length issue. By @jamesmbaazam in #858 and reviewed by @sbfnk. + +## Documentation + +- Brought the docs on `alpha_sd` up to date with the code change from prior PR #853. By @zsusswein in #862 and reviewed by @jamesmbaazam. # EpiNow2 1.6.1 diff --git a/R/checks.R b/R/checks.R index 575700173..fb44140d5 100644 --- a/R/checks.R +++ b/R/checks.R @@ -15,14 +15,14 @@ #' "estimate_infections", "estimate_truncation", or "estimate_secondary". #' This is used to determine which checks to perform on the data input. #' @importFrom checkmate assert_data_frame assert_date assert_names -#' assert_numeric +#' assert_numeric +#' @importFrom purrr walk #' @importFrom rlang arg_match #' @return Called for its side effects. #' @keywords internal check_reports_valid <- function(data, model = c( "estimate_infections", - "estimate_truncation", "estimate_secondary" )) { # Check that the case time series (reports) is a data frame @@ -30,14 +30,13 @@ check_reports_valid <- function(data, # Perform checks depending on the model to the data is meant to be used with model <- arg_match(model) + assert_date(data$date, any.missing = FALSE) if (model == "estimate_secondary") { # Check that data has the right column names assert_names( names(data), must.include = c("date", "primary", "secondary") ) - # Check that the data data.frame has the right column types - assert_date(data$date, any.missing = FALSE) assert_numeric(data$primary, lower = 0) assert_numeric(data$secondary, lower = 0) } else { @@ -46,10 +45,10 @@ check_reports_valid <- function(data, names(data), must.include = c("date", "confirm") ) - # Check that the data data.frame has the right column types - assert_date(data$date, any.missing = FALSE) assert_numeric(data$confirm, lower = 0) } + assert_logical(data$accumulate, null.ok = TRUE) + return(invisible(data)) } #' Validate probability distribution for passing to stan @@ -224,7 +223,7 @@ test_data_complete <- function(data, cols_to_check) { #' Cross-check treatment of `NA` in obs_opts() against input data #' -#' @description `r lifecycle::badge("experimental")` +#' @description `r lifecycle::badge("deprecated")` #' #' This function checks the input data for implicit and/or explicit missingness #' and checks if the user specified `na = "missing"` in [obs_opts()]. diff --git a/R/create.R b/R/create.R index 1309fd508..2ffcfaf62 100644 --- a/R/create.R +++ b/R/create.R @@ -77,6 +77,10 @@ create_clean_reported_cases <- function(data, horizon = 0, } reported_cases[is.na(confirm), confirm := fill] reported_cases[, "average_7_day" := NULL] + ## set accumulate to FALSE in added rows + if ("accumulate" %in% colnames(reported_cases)) { + reported_cases[is.na(accumulate), accumulate := FALSE] + } return(reported_cases) } @@ -484,7 +488,6 @@ create_obs_model <- function(obs = obs_opts(), dates) { obs_scale = as.integer(obs$scale$sd > 0 || obs$scale$mean != 1), obs_scale_mean = obs$scale$mean, obs_scale_sd = obs$scale$sd, - accumulate = obs$accumulate, likelihood = as.numeric(obs$likelihood), return_likelihood = as.numeric(obs$return_likelihood) ) @@ -525,14 +528,16 @@ create_obs_model <- function(obs = obs_opts(), dates) { create_stan_data <- function(data, seeding_time, rt, gp, obs, horizon, backcalc, shifted_cases) { - cases <- data[(seeding_time + 1):(.N - horizon)] complete_cases <- create_complete_cases(cases) cases <- cases$confirm + accumulate <- data[-(1:seeding_time)]$accumulate stan_data <- list( cases = complete_cases$confirm, cases_time = complete_cases$lookup, + any_accumulate = as.integer(any(accumulate)), + accumulate = as.integer(accumulate), lt = nrow(complete_cases), shifted_cases = shifted_cases, t = length(data$date), diff --git a/R/estimate_infections.R b/R/estimate_infections.R index 7af5b2f1a..8c1f457c9 100644 --- a/R/estimate_infections.R +++ b/R/estimate_infections.R @@ -17,8 +17,14 @@ #' for an example of using `estimate_infections` within the `epinow` wrapper to #' estimate Rt for Covid-19 in a country from the ECDC data source. #' -#' @param data A `` of confirmed cases (confirm) by date -#' (date). `confirm` must be numeric and `date` must be in date format. +#' @param data A `` of confirmed cases (confirm) by date (date). +#' `confirm` must be numeric and `date` must be in date format. Optionally +#' this can also have a logical `accumulate` column which indicates whether +#' data should be added to the next data point. This is useful when modelling +#' e.g. weekly incidence data. See also the [fill_missing()] function which +#' helps add the `accumulate` column with the desired properties when dealing +#' with non-daily data. If any accumulation is done this happens after +#' truncation as specified by the `truncation` argument. #' #' @param reported_cases Deprecated; use `data` instead. #' @@ -176,10 +182,11 @@ estimate_infections <- function(data, data = dirty_reported_cases, cols_to_check = c("date", "confirm") ) + # Fill missing dates + reported_cases <- default_fill_missing_obs(data, obs, "confirm") # Create clean and complete cases - # Order cases reported_cases <- create_clean_reported_cases( - data, horizon, + reported_cases, horizon, filter_leading_zeros = filter_leading_zeros, zero_threshold = zero_threshold ) @@ -197,9 +204,9 @@ estimate_infections <- function(data, min(reported_cases$date) - 1, by = "days" ), - confirm = 0, breakpoint = 0 + confirm = 0, accumulate = FALSE, breakpoint = 0 ), - reported_cases + reported_cases[, .(date, confirm, accumulate, breakpoint)] )) shifted_cases <- create_shifted_cases( diff --git a/R/estimate_secondary.R b/R/estimate_secondary.R index 1af53fbb5..9c24eafe2 100644 --- a/R/estimate_secondary.R +++ b/R/estimate_secondary.R @@ -26,7 +26,13 @@ #' respectively on the log scale). #' #' @param data A `` containing the `date` of report and both -#' `primary` and `secondary` reports. +#' `primary` and `secondary` reports. Optionally this can also have a logical +#' `accumulate` column which indicates whether data should be added to the +#' next data point. This is useful when modelling e.g. weekly incidence data. +#' See also the [fill_missing()] function which helps add the `accumulate` +#' column with the desired properties when dealing with non-daily data. If any +#' accumulation is done this happens after truncation as specified by the +#' `truncation` argument. #' #' @param reports Deprecated; use `data` instead. #' @@ -190,7 +196,10 @@ estimate_secondary <- function(data, data = reports, cols_to_check = c("date", "primary", "secondary") ) - secondary_reports_dirty <- reports[, list(date, confirm = secondary)] + reports <- default_fill_missing_obs(reports, obs, "secondary") + + secondary_reports_dirty <- + reports[, list(date, confirm = secondary, accumulate)] secondary_reports <- create_clean_reported_cases( secondary_reports_dirty, filter_leading_zeros = filter_leading_zeros, @@ -225,7 +234,9 @@ estimate_secondary <- function(data, obs_time = complete_secondary[lookup > burn_in]$lookup - burn_in, lt = sum(complete_secondary$lookup > burn_in), burn_in = burn_in, - seeding_time = 0 + seeding_time = 0, + any_accumulate = as.integer(any(reports$accumulate > 0)), + accumulate = as.integer(reports$accumulate) ) # secondary model options stan_data <- c(stan_data, secondary) diff --git a/R/estimate_truncation.R b/R/estimate_truncation.R index 55efc164f..0601bd073 100644 --- a/R/estimate_truncation.R +++ b/R/estimate_truncation.R @@ -145,7 +145,7 @@ estimate_truncation <- function(data, ) } # Validate inputs - walk(data, check_reports_valid, model = "estimate_truncation") + walk(data, check_reports_valid, model = "estimate_infections") assert_class(truncation, "dist_spec") assert_class(model, "stanfit", null.ok = TRUE) assert_numeric(CrIs, lower = 0, upper = 1) diff --git a/R/opts.R b/R/opts.R index 2627b14aa..d56a31a32 100644 --- a/R/opts.R +++ b/R/opts.R @@ -594,15 +594,7 @@ gp_opts <- function(basis_prop = 0.2, #' scale) or a list with numeric elements mean (`mean`) and standard deviation #' (`sd`) defining a normally distributed scaling factor. Defaults to 1, i.e. #' no scaling. -#' @param na Character. Options are "missing" (the default) and "accumulate". -#' This determines how NA values in the data are interpreted. If set to -#' "missing", any NA values in the observation data set will be interpreted as -#' missing and skipped in the likelihood. If set to "accumulate", modelled -#' observations will be accumulated and added to the next non-NA data point. -#' This can be used to model incidence data that is reported at less than -#' daily intervals. If set to "accumulate", the first data point is not -#' included in the likelihood but used only to reset modelled observations to -#' zero. +#' @param na Deprecated; use the [fill_missing()] function instead #' @param likelihood Logical, defaults to `TRUE`. Should the likelihood be #' included in the model. #' @param return_likelihood Logical, defaults to `FALSE`. Should the likelihood @@ -631,6 +623,21 @@ obs_opts <- function(family = c("negbin", "poisson"), return_likelihood = FALSE) { # NB: This has to be checked first before the na argument is touched anywhere. na_default_used <- missing(na) + if (!na_default_used) { + lifecycle::deprecate_warn( + "1.7.0", + "obs_opts(na)", + "fill_missing()", + details = c( + paste0( + "If NA values are not to be treated as missing use the ", + "`fill_missing()` function instead." + ), + "This argument will be removed in the next release of EpiNow2." + ) + ) + + } na <- arg_match(na) if (na == "accumulate") { #nolint start: duplicate_argument_linter @@ -641,8 +648,8 @@ obs_opts <- function(family = c("negbin", "poisson"), "i" = "This means that the first data point is not included in the likelihood but used only to reset modelled observations to zero.", "i" = "{col_red('If the first data point should be included in the - likelihood this can be achieved by adding a data point of arbitrary - value before the first data point.')}" + likelihood this can be achieved by using the `fill_missing()` function + with a non-zero `initial_missing` argument.')}" ), .frequency = "regularly", .frequency_id = "obs_opts" diff --git a/R/preprocessing.R b/R/preprocessing.R new file mode 100644 index 000000000..a43171bad --- /dev/null +++ b/R/preprocessing.R @@ -0,0 +1,179 @@ +##' Fill missing data in a data set to prepare it for use within the package +##' +##' @description `r lifecycle::badge("experimental")` +##' This function ensures that all days between the first and last date in the +##' data are present. It adds an `accumulate` column that indicates whether +##' modelled observations should be accumulated onto a later data point. +##' point. This is useful for modelling data that is reported less frequently +##' than daily, e.g. weekly incidence data, as well as other reporting +##' artifacts such as delayed weekedn reporting. The function can also be used +##' to fill in missing observations with zeros. +##' +##' @param data Data frame with a `date` column. The other columns depend on the +##' model that the data are to be used, e.g. [estimate_infections()] or +##' [estimate_secondary()]. See the documentation there for the expected +##' format. The data must not already have an `accumulate` function, otherwise +##' the function will fail with an error. +##' @param missing_dates Character. Options are "ignore" (the default), +##' "accumulate" and "zero". This determines how missing dates in the data are +##' interpreted. If set to "ignore", any missing dates in the observation +##' data will be interpreted as missing and skipped in the likelihood. If set +##' to "accumulate", modelled observations on dates that are missing in the +##' data will be accumulated and added to the next non-missing data point. +##' This can be used to model incidence data that is reported less frequently +##' than daily. In that case, the first data point is not included in the +##' likelihood (unless `initial_accumulate` is set to a value greater than +##' one) but used only to reset modelled observations to zero. If "zero" then +##' all observations on missing dates will be assumed to be of value 0. +##' @param missing_obs Character. How to process dates that exist in the data +##' but have observations with NA values. The options available are the same +##' ones as for the `missing_dates` argument. +##' @param initial_accumulate Integer. The number of initial dates to accumulate +##' if `missing_dates` or `missing_obs` is set to `"accumulate"`. This +##' argument needs ot have a minimum of 1. If it is set to 1 then no +##' accumulation is happening on the first data point. If it is greater than 1 +##' then dates are added to the beginning of the data set to get be able to +##' have a sufficient number of modelled observations accumulated onto the +##' first data point. For modelling weekly incidence data this should be set +##' to 7. If accumulating and the first data point is not NA and this is +##' argument is not set, then that data point will be removed with a warning. +##' @param obs_column Character (default: "confirm"). If given, only the column +##' specified here will be used for checking missingness. This is useful if +##' using a data set that has multiple columns of hwich one of them +##' corresponds to observations that are to be processed here. +##' @param by Character vector. Name(s) of any additional column(s) where +##' missing data should be processed separately for each value in the column. +##' This is useful when using data representing e.g. multiple geographies. If +##' NULL (default) no such grouping is done. +##' @return a data.table with an `accumulate` column that indicates whether +##' values are accumulated (see the documentation of the `data` argument in +##' [estimate_infections()]) +##' @importFrom rlang arg_match +##' @importFrom data.table as.data.table CJ +##' @importFrom checkmate assert_data_frame assert_names assert_date +##' assert_character assert_integerish +##' @export +##' @examples +##' cases <- data.table::copy(example_confirmed) +##' ## calculate weekly sum +##' cases[, confirm := data.table::frollsum(confirm, 7)] +##' ## limit to dates once a week +##' cases <- cases[seq(7, nrow(cases), 7)] +##' ## set the second observation to missing +##' cases[2, confirm := NA] +##' ## fill missing data +##' fill_missing(cases, missing_dates = "accumulate", initial_accumulate = 7) +fill_missing <- function(data, + missing_dates = c("ignore", "accumulate", "zero"), + missing_obs = c("ignore", "accumulate", "zero"), + initial_accumulate, + obs_column = "confirm", + by = NULL) { + assert_data_frame(data) + assert_character(missing_dates) + assert_character(missing_obs) + assert_character(obs_column) + assert_character(by, null.ok = TRUE) + if (!missing(initial_accumulate)) { + assert_integerish(initial_accumulate, lower = 1) + } + assert_names( + colnames(data), + must.include = c("date", by, obs_column), + disjunct.from = "accumulate" + ) + assert_date(data$date, any.missing = FALSE) + + data <- as.data.table(data) + + missing_dates <- arg_match(missing_dates) + missing_obs <- arg_match(missing_obs) + + ## first, processing missing dates + initial_add <- ifelse(missing(initial_accumulate), 1, initial_accumulate) + + cols <- list( + date = seq(min(data$date) - initial_add + 1, max(data$date), by = "day") + ) + if (!is.null(by)) { + for (by_col in by) { + cols[[by_col]] <- unique(data[[by_col]]) + } + } + + complete <- do.call(CJ, cols) + + ## mark dates that are present in the data + data[, ..present := TRUE] + data <- merge(data, complete, by = c("date", by), all.y = TRUE) + + data[, accumulate := FALSE] + for (col in obs_column) { + if (missing_dates == "accumulate") { + data[is.na(..present), accumulate := TRUE] + } else if (missing_dates == "zero") { + data[is.na(..present), paste(col) := 0] + } + } + if (missing(initial_accumulate) && + isFALSE(data$accumulate[1]) && + any(data$accumulate)) { + #nolint start: duplicate_argument_linter + cli_warn( + c( + "!" = + "Initial data point not marked as accumulated but some others are.", + "i" = + "This means that the first data point will not be included in the + likelihood.", + "i" = + "To change this behaviour, set `initial_accumulate` + (including setting it to 1 if the first data point is to be taken + as is, i.e. as the first daily data point)." + ) + ) + #nolint end + data <- data[date > min(date)] + } + + ## second, processing missing observations + for (col in obs_column) { + if (missing_obs == "accumulate") { + data[is.na(get(col)) & !is.na(..present), accumulate := TRUE] + } else if (missing_obs == "zero") { + data[is.na(get(col)) & !is.na(..present), paste(col) := 0] + } + } + + data[, ..present := NULL] + + return(data[]) +} + +##' Temporary function to support the transition to full support of missing +##' data. +##' +##' @description `r lifecycle::badge("deprecated")` +##' +##' @inheritParams create_stan_data +##' @inheritParams fill_missing +##' @return data set with missing dates filled in as na values +##' @keywords internal +default_fill_missing_obs <- function(data, obs, obs_column) { + if (!("accumulate" %in% colnames(data))) { + data_rows <- nrow(data) + data <- fill_missing(data = data, obs_column = obs_column) + if (nrow(data) > data_rows && !obs$accumulate) { + #nolint start: duplicate_argument_linter + cli_warn( + "!" = "Data contains missing dates.", + "i" = "Missing dates are interpreted as truly missing data. + In the future, this behaviour will be deprecated and the package + functions will expect complete data.", + "i" = "Complete data can be created using the `fill_missing` function." + ) + #nolint end + } + } + return(data) +} diff --git a/R/utilities.R b/R/utilities.R index 91867b062..5fb413fc5 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -441,6 +441,7 @@ globalVariables( "central_lower", "central_upper", "mean_sd", "sd_sd", "average_7_day", "..lowers", "..upper_CrI", "..uppers", "timing", "dataset", "last_confirm", "report_date", "secondary", "id", "conv", "meanlog", "primary", "scaled", - "scaling", "sdlog", "lookup", "new_draw", ".draw", "p", "distribution" + "scaling", "sdlog", "lookup", "new_draw", ".draw", "p", "distribution", + "accumulate", "..present" ) ) diff --git a/inst/dev/accumulation.Rmd b/inst/dev/accumulation.Rmd new file mode 100644 index 000000000..cb4c14f15 --- /dev/null +++ b/inst/dev/accumulation.Rmd @@ -0,0 +1,63 @@ +--- +output: github_document +--- + +```{r setup, echo = FALSE} +library("knitr") +``` + +# Supporting missing data + +We want to support reporting patterns that include *accumulation* (i.e. batch reporting of data from multiple dates, for example weekly) and *missingness* (dates which are lacking reports) in incidence and prevalence data. + +## Proposed interface + +### Missing data + +Any dates between the minimum date and maximum date in the data that is either absent, or present with an `NA` value (currently called `confirm`) is interpreted as missing and ignored in the likelihood. +All other data points are used in the likelihood. +This matches the current default behaviour, introduced in version 1.5.0. + +### Accumulation + +If instead modelled values on these days should be accumulated onto the next reporting date, the passed `data.frame` must have an additional logical column, `accumulate`. +If `accumulate` is TRUE then the modelled value of the observed latent variable on that day is added to any existing accumulation and stored for later observation. +If `accumulate` is FALSE the modelled value is added to any stored accumulated variables before potentially being used in the likelihood on that day (if not `NA`). +Subsequently the stored accumulated variable is reset to zero. + +### Example + +```{r results = "asis"} +df <- data.frame( + date = as.Date(c("2024-10-23", "2024-10-24", "2024-10-26", "2024-10-27", "2024-10-28")), + confirm = c(NA, 10, NA, NA, 17), + accumulate = c(TRUE, FALSE, TRUE, TRUE, FALSE) +) +df |> + kable(align = "l") +``` + +The likelihood is evaluated on two days, 24 October and 28 October. +On 24 October the data (10) is compared to (modelled value on 23 October) + (modelled value on 24 October). +On 28 October the data (17) is compared to (modelled value on 26 October) + (modelled value on 27 October) + (modelled value on 28 October). + +## Helper functions + +A helper function, `fill_missing()` can be used to convert weekly data to the required format, i.e. + +```{r, results = "asis"} +df <- data.frame( + date = as.Date(c("2024-10-24", "2024-10-31", "2024-11-07")), + confirm = c(10, 17, 11) +) +df |> + kable(align = "l") +``` + +can be converted with `fill_missing(missing_dates = "accumulate", initial_accumulate = 7)` to + +```{r, results = "asis"} +df |> + fill_missing(missing_dates = "accumulate", initial_accumulate = 7) |> + kable(align = "l") +``` diff --git a/inst/dev/accumulation.md b/inst/dev/accumulation.md index b10c5a108..63956a22c 100644 --- a/inst/dev/accumulation.md +++ b/inst/dev/accumulation.md @@ -1,50 +1,89 @@ + # Supporting missing data -We want to support reporting patterns that include *accumulation* (i.e. batch reporting of data from multiple dates, for example weekly) and *missingness* (dates which are lacking reports) in incidence and prevalence data. +We want to support reporting patterns that include *accumulation* +(i.e. batch reporting of data from multiple dates, for example weekly) +and *missingness* (dates which are lacking reports) in incidence and +prevalence data. ## Proposed interface ### Missing data -Any dates between the minimum date and maximum date in the data that is either absent, or present with an `NA` value (currently called `confirm`) is interpreted as missing and ignored in the likelihood. -All other data points are used in the likelihood. -This matches the current default behaviour, introduced in version 1.5.0. +Any dates between the minimum date and maximum date in the data that is +either absent, or present with an `NA` value (currently called +`confirm`) is interpreted as missing and ignored in the likelihood. All +other data points are used in the likelihood. This matches the current +default behaviour, introduced in version 1.5.0. ### Accumulation -If instead modelled values on these days should be accumulated onto the next reporting date, the passed `data.frame` must have an additional logical column, `accumulate`. -If `accumulate` is TRUE then the modelled value of the observed latent variable on that day is added to any existing accumulation and stored for later observation. -If `accumulate` is FALSE the modelled value is added to any stored accumulated variables before potentially being used in the likelihood on that day (if not `NA`). -Subsequently the stored accumulated variable is reset to zero. +If instead modelled values on these days should be accumulated onto the +next reporting date, the passed `data.frame` must have an additional +logical column, `accumulate`. If `accumulate` is TRUE then the modelled +value of the observed latent variable on that day is added to any +existing accumulation and stored for later observation. If `accumulate` +is FALSE the modelled value is added to any stored accumulated variables +before potentially being used in the likelihood on that day (if not +`NA`). Subsequently the stored accumulated variable is reset to zero. ### Example +``` r +df <- data.frame( + date = as.Date(c("2024-10-23", "2024-10-24", "2024-10-26", "2024-10-27", "2024-10-28")), + confirm = c(NA, 10, NA, NA, 17), + accumulate = c(TRUE, FALSE, TRUE, TRUE, FALSE) +) +df |> + kable(align = "l") +``` + | date | confirm | accumulate | -|------------|---------|------------| +|:-----------|:--------|:-----------| | 2024-10-23 | NA | TRUE | | 2024-10-24 | 10 | FALSE | | 2024-10-26 | NA | TRUE | | 2024-10-27 | NA | TRUE | | 2024-10-28 | 17 | FALSE | -The likelihood is evaluated on two days, 24 October and 28 October. -On 24 October the data (10) is compared to (modelled value on 23 October) + (modelled value on 24 October). -On 28 October the data (17) is compared to (modelled value on 26 October) + (modelled value on 27 October) + (modelled value on 28 October). +The likelihood is evaluated on two days, 24 October and 28 October. On +24 October the data (10) is compared to (modelled value on 23 October) + +(modelled value on 24 October). On 28 October the data (17) is compared +to (modelled value on 26 October) + (modelled value on 27 October) + +(modelled value on 28 October). ## Helper functions -A helper function, `fill_missing_dates()` can be used to convert weekly data to the required format, i.e. +A helper function, `fill_missing()` can be used to convert weekly data +to the required format, i.e. + +``` r +df <- data.frame( + date = as.Date(c("2024-10-24", "2024-10-31", "2024-11-07")), + confirm = c(10, 17, 11) +) +df |> + kable(align = "l") +``` | date | confirm | -|------------|---------| +|:-----------|:--------| | 2024-10-24 | 10 | | 2024-10-31 | 17 | | 2024-11-07 | 11 | -can be converted with `fill_missing_dates(missing = "accumulate", initial = 7)` to +can be converted with +`fill_missing(missing_dates = "accumulate", initial_accumulate = 7)` to + +``` r +df |> + fill_missing(missing_dates = "accumulate", initial_accumulate = 7) |> + kable(align = "l" ) +``` | date | confirm | accumulate | -|------------|---------|------------| +|:-----------|:--------|:-----------| | 2024-10-18 | NA | TRUE | | 2024-10-19 | NA | TRUE | | 2024-10-20 | NA | TRUE | @@ -65,4 +104,4 @@ can be converted with `fill_missing_dates(missing = "accumulate", initial = 7)` | 2024-11-04 | NA | TRUE | | 2024-11-05 | NA | TRUE | | 2024-11-06 | NA | TRUE | -| 2024-11-07 | 11 | TRUE | +| 2024-11-07 | 11 | FALSE | diff --git a/inst/stan/data/observation_model.stan b/inst/stan/data/observation_model.stan index 671004ef4..0ce9ef3bb 100644 --- a/inst/stan/data/observation_model.stan +++ b/inst/stan/data/observation_model.stan @@ -9,6 +9,5 @@ real obs_weight; // weight given to observation in log density int likelihood; // Should the likelihood be included in the model int return_likelihood; // Should the likehood be returned by the model - int accumulate; // Should missing values be accumulated int trunc_id; // id of truncation int delay_id; // id of delay diff --git a/inst/stan/data/observations.stan b/inst/stan/data/observations.stan index 11fe8463c..1b41f54d6 100644 --- a/inst/stan/data/observations.stan +++ b/inst/stan/data/observations.stan @@ -5,4 +5,6 @@ int future_time; // time in future for Rt array[lt] int cases; // observed cases array[lt] int cases_time; // time of observed cases + int any_accumulate; // Should any missing values be accumulated? + array[t - seeding_time] int accumulate; // Should missing values be accumulated (by time) vector[t] shifted_cases; // prior infections (for backcalculation) diff --git a/inst/stan/estimate_infections.stan b/inst/stan/estimate_infections.stan index 303b6b0e4..7d7ea1018 100644 --- a/inst/stan/estimate_infections.stan +++ b/inst/stan/estimate_infections.stan @@ -156,6 +156,13 @@ transformed parameters { } else { obs_reports = reports[1:ot]; } + + // accumulate reports + if (any_accumulate) { + profile("accumulate") { + obs_reports = accumulate_reports(obs_reports, accumulate); + } + } } model { @@ -199,7 +206,7 @@ model { profile("report lp") { report_lp( cases, cases_time, obs_reports, rep_phi, phi_mean, phi_sd, model_type, - obs_weight, accumulate + obs_weight ); } } diff --git a/inst/stan/estimate_secondary.stan b/inst/stan/estimate_secondary.stan index 8f5081fb0..15836243c 100644 --- a/inst/stan/estimate_secondary.stan +++ b/inst/stan/estimate_secondary.stan @@ -13,6 +13,8 @@ data { array[lt] int obs_time; // observed secondary data vector[t] primary; // observed primary data int burn_in; // time period to not use for fitting + int any_accumulate; // Should any missing values be accumulated? + array[t] int accumulate; // Should missing values be accumulated (by time) #include data/secondary.stan #include data/delays.stan #include data/observation_model.stan @@ -70,6 +72,7 @@ transformed parameters { if (week_effect > 1) { secondary = day_of_week_effect(secondary, day_of_week, day_of_week_simplex); } + // truncate near time cases to observed reports if (trunc_id) { vector[delay_type_max[trunc_id]] trunc_rev_cmf = get_delay_rev_pmf( @@ -80,6 +83,13 @@ transformed parameters { ); secondary = truncate_obs(secondary, trunc_rev_cmf, 0); } + + // accumulate reports + if (any_accumulate) { + profile("accumulate") { + secondary = accumulate_reports(secondary, accumulate); + } + } } model { @@ -97,7 +107,7 @@ model { if (likelihood) { report_lp( obs[(burn_in + 1):t][obs_time], obs_time, secondary[(burn_in + 1):t], - rep_phi, phi_mean, phi_sd, model_type, 1, accumulate + rep_phi, phi_mean, phi_sd, model_type, 1 ); } } diff --git a/inst/stan/functions/observation_model.stan b/inst/stan/functions/observation_model.stan index aa10ccfda..9a96d958e 100644 --- a/inst/stan/functions/observation_model.stan +++ b/inst/stan/functions/observation_model.stan @@ -102,52 +102,54 @@ void truncation_lp(array[] real truncation_mean, array[] real truncation_sd, * @param phi_sd Real value for standard deviation of reporting overdispersion prior. * @param model_type Integer indicating the model type (0 for Poisson, >0 for Negative Binomial). * @param weight Real value for weighting the log density contribution. - * @param accumulate Integer flag indicating whether to accumulate reports (1) or not (0). + * @param accumulate Array of integers indicating, for each time point, whether + * to accumulate reports (1) or not (0). */ void report_lp(array[] int cases, array[] int cases_time, vector reports, array[] real rep_phi, real phi_mean, real phi_sd, - int model_type, real weight, int accumulate) { - int n = num_elements(cases_time) - accumulate; // number of observations - vector[n] obs_reports; // reports at observation time - array[n] int obs_cases; // observed cases at observation time - if (accumulate) { - int t = num_elements(reports); - int i = 0; - int current_obs = 0; - obs_reports = rep_vector(0, n); - while (i <= t && current_obs <= n) { - if (current_obs > 0) { // first observation gets ignored when accumulating - obs_reports[current_obs] += reports[i]; - } - if (i == cases_time[current_obs + 1]) { - current_obs += 1; - } - i += 1; - } - obs_cases = cases[2:(n + 1)]; - } else { - obs_reports = reports[cases_time]; - obs_cases = cases; - } + int model_type, real weight) { + int n = num_elements(cases_time); // number of observations + vector[n] obs_reports = reports[cases_time]; // reports at observation time if (model_type) { real dispersion = inv_square(phi_sd > 0 ? rep_phi[model_type] : phi_mean); if (phi_sd > 0) { rep_phi[model_type] ~ normal(phi_mean, phi_sd) T[0,]; } if (weight == 1) { - obs_cases ~ neg_binomial_2(obs_reports, dispersion); + cases ~ neg_binomial_2(obs_reports, dispersion); } else { target += neg_binomial_2_lpmf( - obs_cases | obs_reports, dispersion + cases | obs_reports, dispersion ) * weight; } } else { if (weight == 1) { - obs_cases ~ poisson(obs_reports); + cases ~ poisson(obs_reports); } else { - target += poisson_lpmf(obs_cases | obs_reports) * weight; + target += poisson_lpmf(cases | obs_reports) * weight; + } + } +} + +/** + * Accumulate reports according to a binary flag at each time point + * + * This function accumulates reports according to a binary flag at each time point. + * + * @param reports Vector of expected reports. + * @param accumulate Array of integers indicating, for each time point, whether to accumulate or not. + * + * @return A vector of accumulated reports. + */ +vector accumulate_reports(vector reports, array[] int accumulate) { + int ot_h = num_elements(reports); // number of reporting time points modelled + vector[ot_h] accumulated_reports = reports; + for (i in 1:(ot_h - 1)) { + if (accumulate[i]) { // first observation gets ignored when accumulating + accumulated_reports[i + 1] += accumulated_reports[i]; } } + return accumulated_reports; } /** diff --git a/man/check_na_setting_against_data.Rd b/man/check_na_setting_against_data.Rd index 54631c09c..0fc510489 100644 --- a/man/check_na_setting_against_data.Rd +++ b/man/check_na_setting_against_data.Rd @@ -17,7 +17,7 @@ check_na_setting_against_data(data, cols_to_check, obs) \code{\link[=obs_opts]{obs_opts()}} } \description{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This function checks the input data for implicit and/or explicit missingness and checks if the user specified \code{na = "missing"} in \code{\link[=obs_opts]{obs_opts()}}. diff --git a/man/check_reports_valid.Rd b/man/check_reports_valid.Rd index de18dc20d..03e19a540 100644 --- a/man/check_reports_valid.Rd +++ b/man/check_reports_valid.Rd @@ -6,7 +6,7 @@ \usage{ check_reports_valid( data, - model = c("estimate_infections", "estimate_truncation", "estimate_secondary") + model = c("estimate_infections", "estimate_secondary") ) } \arguments{ diff --git a/man/create_clean_reported_cases.Rd b/man/create_clean_reported_cases.Rd index f782e588e..949880c81 100644 --- a/man/create_clean_reported_cases.Rd +++ b/man/create_clean_reported_cases.Rd @@ -14,8 +14,14 @@ create_clean_reported_cases( ) } \arguments{ -\item{data}{A \verb{} of confirmed cases (confirm) by date -(date). \code{confirm} must be numeric and \code{date} must be in date format.} +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} \item{horizon}{Numeric, defaults to 7. Number of days into the future to forecast.} diff --git a/man/create_shifted_cases.Rd b/man/create_shifted_cases.Rd index 1a1dff5a6..a9d6fc048 100644 --- a/man/create_shifted_cases.Rd +++ b/man/create_shifted_cases.Rd @@ -7,8 +7,14 @@ create_shifted_cases(data, shift, smoothing_window, horizon) } \arguments{ -\item{data}{A \verb{} of confirmed cases (confirm) by date -(date). \code{confirm} must be numeric and \code{date} must be in date format.} +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} \item{shift}{Numeric, mean delay shift to apply.} diff --git a/man/create_stan_data.Rd b/man/create_stan_data.Rd index ed80f379e..383f0844e 100644 --- a/man/create_stan_data.Rd +++ b/man/create_stan_data.Rd @@ -16,8 +16,14 @@ create_stan_data( ) } \arguments{ -\item{data}{A \verb{} of confirmed cases (confirm) by date -(date). \code{confirm} must be numeric and \code{date} must be in date format.} +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} \item{seeding_time}{Integer; seeding time, usually obtained using \code{\link[=get_seeding_time]{get_seeding_time()}}.} diff --git a/man/default_fill_missing_obs.Rd b/man/default_fill_missing_obs.Rd new file mode 100644 index 000000000..81e6f60b5 --- /dev/null +++ b/man/default_fill_missing_obs.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{default_fill_missing_obs} +\alias{default_fill_missing_obs} +\title{Temporary function to support the transition to full support of missing +data.} +\usage{ +default_fill_missing_obs(data, obs, obs_column) +} +\arguments{ +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} + +\item{obs}{A list of options as generated by \code{\link[=obs_opts]{obs_opts()}} defining the +observation model. Defaults to \code{\link[=obs_opts]{obs_opts()}}.} + +\item{obs_column}{Character (default: "confirm"). If given, only the column +specified here will be used for checking missingness. This is useful if +using a data set that has multiple columns of hwich one of them +corresponds to observations that are to be processed here.} +} +\value{ +data set with missing dates filled in as na values +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} +} +\keyword{internal} diff --git a/man/epinow.Rd b/man/epinow.Rd index 6046dea56..2654dfe0b 100644 --- a/man/epinow.Rd +++ b/man/epinow.Rd @@ -30,8 +30,14 @@ epinow( ) } \arguments{ -\item{data}{A \verb{} of confirmed cases (confirm) by date -(date). \code{confirm} must be numeric and \code{date} must be in date format.} +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} \item{generation_time}{A call to \code{\link[=gt_opts]{gt_opts()}} (or its alias \code{\link[=generation_time_opts]{generation_time_opts()}}) defining the generation time distribution used. diff --git a/man/estimate_infections.Rd b/man/estimate_infections.Rd index cab919f85..528a76edf 100644 --- a/man/estimate_infections.Rd +++ b/man/estimate_infections.Rd @@ -26,8 +26,14 @@ estimate_infections( ) } \arguments{ -\item{data}{A \verb{} of confirmed cases (confirm) by date -(date). \code{confirm} must be numeric and \code{date} must be in date format.} +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} \item{generation_time}{A call to \code{\link[=gt_opts]{gt_opts()}} (or its alias \code{\link[=generation_time_opts]{generation_time_opts()}}) defining the generation time distribution used. diff --git a/man/estimate_secondary.Rd b/man/estimate_secondary.Rd index 06ff2e848..b985c121c 100644 --- a/man/estimate_secondary.Rd +++ b/man/estimate_secondary.Rd @@ -26,7 +26,13 @@ estimate_secondary( } \arguments{ \item{data}{A \verb{} containing the \code{date} of report and both -\code{primary} and \code{secondary} reports.} +\code{primary} and \code{secondary} reports. Optionally this can also have a logical +\code{accumulate} column which indicates whether data should be added to the +next data point. This is useful when modelling e.g. weekly incidence data. +See also the \code{\link[=fill_missing]{fill_missing()}} function which helps add the \code{accumulate} +column with the desired properties when dealing with non-daily data. If any +accumulation is done this happens after truncation as specified by the +\code{truncation} argument.} \item{secondary}{A call to \code{\link[=secondary_opts]{secondary_opts()}} or a list containing the following binary variables: cumulative, historic, primary_hist_additive, diff --git a/man/fill_missing.Rd b/man/fill_missing.Rd new file mode 100644 index 000000000..2120c6ddf --- /dev/null +++ b/man/fill_missing.Rd @@ -0,0 +1,84 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{fill_missing} +\alias{fill_missing} +\title{Fill missing data in a data set to prepare it for use within the package} +\usage{ +fill_missing( + data, + missing_dates = c("ignore", "accumulate", "zero"), + missing_obs = c("ignore", "accumulate", "zero"), + initial_accumulate, + obs_column = "confirm", + by = NULL +) +} +\arguments{ +\item{data}{Data frame with a \code{date} column. The other columns depend on the +model that the data are to be used, e.g. \code{\link[=estimate_infections]{estimate_infections()}} or +\code{\link[=estimate_secondary]{estimate_secondary()}}. See the documentation there for the expected +format. The data must not already have an \code{accumulate} function, otherwise +the function will fail with an error.} + +\item{missing_dates}{Character. Options are "ignore" (the default), +"accumulate" and "zero". This determines how missing dates in the data are +interpreted. If set to "ignore", any missing dates in the observation +data will be interpreted as missing and skipped in the likelihood. If set +to "accumulate", modelled observations on dates that are missing in the +data will be accumulated and added to the next non-missing data point. +This can be used to model incidence data that is reported less frequently +than daily. In that case, the first data point is not included in the +likelihood (unless \code{initial_accumulate} is set to a value greater than +one) but used only to reset modelled observations to zero. If "zero" then +all observations on missing dates will be assumed to be of value 0.} + +\item{missing_obs}{Character. How to process dates that exist in the data +but have observations with NA values. The options available are the same +ones as for the \code{missing_dates} argument.} + +\item{initial_accumulate}{Integer. The number of initial dates to accumulate +if \code{missing_dates} or \code{missing_obs} is set to \code{"accumulate"}. This +argument needs ot have a minimum of 1. If it is set to 1 then no +accumulation is happening on the first data point. If it is greater than 1 +then dates are added to the beginning of the data set to get be able to +have a sufficient number of modelled observations accumulated onto the +first data point. For modelling weekly incidence data this should be set +to 7. If accumulating and the first data point is not NA and this is +argument is not set, then that data point will be removed with a warning.} + +\item{obs_column}{Character (default: "confirm"). If given, only the column +specified here will be used for checking missingness. This is useful if +using a data set that has multiple columns of hwich one of them +corresponds to observations that are to be processed here.} + +\item{by}{Character vector. Name(s) of any additional column(s) where +missing data should be processed separately for each value in the column. +This is useful when using data representing e.g. multiple geographies. If +NULL (default) no such grouping is done.} +} +\value{ +a data.table with an \code{accumulate} column that indicates whether +values are accumulated (see the documentation of the \code{data} argument in +\code{\link[=estimate_infections]{estimate_infections()}}) +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +This function ensures that all days between the first and last date in the +data are present. It adds an \code{accumulate} column that indicates whether +modelled observations should be accumulated onto a later data point. +point. This is useful for modelling data that is reported less frequently +than daily, e.g. weekly incidence data, as well as other reporting +artifacts such as delayed weekedn reporting. The function can also be used +to fill in missing observations with zeros. +} +\examples{ +cases <- data.table::copy(example_confirmed) +## calculate weekly sum +cases[, confirm := data.table::frollsum(confirm, 7)] +## limit to dates once a week +cases <- cases[seq(7, nrow(cases), 7)] +## set the second observation to missing +cases[2, confirm := NA] +## fill missing data +fill_missing(cases, missing_dates = "accumulate", initial_accumulate = 7) +} diff --git a/man/obs_opts.Rd b/man/obs_opts.Rd index 36f1d9ed2..432f1c3af 100644 --- a/man/obs_opts.Rd +++ b/man/obs_opts.Rd @@ -44,15 +44,7 @@ scale) or a list with numeric elements mean (\code{mean}) and standard deviation (\code{sd}) defining a normally distributed scaling factor. Defaults to 1, i.e. no scaling.} -\item{na}{Character. Options are "missing" (the default) and "accumulate". -This determines how NA values in the data are interpreted. If set to -"missing", any NA values in the observation data set will be interpreted as -missing and skipped in the likelihood. If set to "accumulate", modelled -observations will be accumulated and added to the next non-NA data point. -This can be used to model incidence data that is reported at less than -daily intervals. If set to "accumulate", the first data point is not -included in the likelihood but used only to reset modelled observations to -zero.} +\item{na}{Deprecated; use the \code{\link[=fill_missing]{fill_missing()}} function instead} \item{likelihood}{Logical, defaults to \code{TRUE}. Should the likelihood be included in the model.} diff --git a/man/save_input.Rd b/man/save_input.Rd index 2f1ae2747..7afff45dc 100644 --- a/man/save_input.Rd +++ b/man/save_input.Rd @@ -7,8 +7,14 @@ save_input(data, target_folder) } \arguments{ -\item{data}{A \verb{} of confirmed cases (confirm) by date -(date). \code{confirm} must be numeric and \code{date} must be in date format.} +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} \item{target_folder}{Character string specifying where to save results (will create if not present).} diff --git a/man/setup_dt.Rd b/man/setup_dt.Rd index e3811555c..32a7347d4 100644 --- a/man/setup_dt.Rd +++ b/man/setup_dt.Rd @@ -7,8 +7,14 @@ setup_dt(data) } \arguments{ -\item{data}{A \verb{} of confirmed cases (confirm) by date -(date). \code{confirm} must be numeric and \code{date} must be in date format.} +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} } \value{ A data table diff --git a/man/update_horizon.Rd b/man/update_horizon.Rd index 387646e0b..a86477339 100644 --- a/man/update_horizon.Rd +++ b/man/update_horizon.Rd @@ -13,8 +13,14 @@ forecast.} \item{target_date}{Date, defaults to maximum found in the data if not specified.} -\item{data}{A \verb{} of confirmed cases (confirm) by date -(date). \code{confirm} must be numeric and \code{date} must be in date format.} +\item{data}{A \verb{} of confirmed cases (confirm) by date (date). +\code{confirm} must be numeric and \code{date} must be in date format. Optionally +this can also have a logical \code{accumulate} column which indicates whether +data should be added to the next data point. This is useful when modelling +e.g. weekly incidence data. See also the \code{\link[=fill_missing]{fill_missing()}} function which +helps add the \code{accumulate} column with the desired properties when dealing +with non-daily data. If any accumulation is done this happens after +truncation as specified by the \code{truncation} argument.} } \value{ Numeric forecast horizon adjusted for the users intention diff --git a/tests/testthat/test-checks.R b/tests/testthat/test-checks.R index 14d207cdb..68c5ce6a4 100644 --- a/tests/testthat/test-checks.R +++ b/tests/testthat/test-checks.R @@ -205,7 +205,7 @@ test_that("check_na_setting_against_data works as expected", { # expect no message expect_no_message( check_na_setting_against_data( - obs = obs_opts(na = "missing"), + obs = suppressWarnings(obs_opts(na = "missing")), data = copy(example_confirmed)[c(1, 3), confirm := NA], cols_to_check = c("date", "confirm") ) diff --git a/tests/testthat/test-create_obs_model.R b/tests/testthat/test-create_obs_model.R index b704ca54d..a9115e261 100644 --- a/tests/testthat/test-create_obs_model.R +++ b/tests/testthat/test-create_obs_model.R @@ -3,10 +3,10 @@ dates <- seq(as.Date("2020-03-15"), by = "days", length.out = 15) test_that("create_obs_model works with default settings", { obs <- create_obs_model(dates = dates) - expect_equal(length(obs), 12) + expect_equal(length(obs), 11) expect_equal(names(obs), c( "model_type", "phi_mean", "phi_sd", "week_effect", "obs_weight", - "obs_scale", "obs_scale_mean", "obs_scale_sd", "accumulate", + "obs_scale", "obs_scale_mean", "obs_scale_sd", "likelihood", "return_likelihood", "day_of_week" )) expect_equal(obs$model_type, 1) diff --git a/tests/testthat/test-estimate_infections.R b/tests/testthat/test-estimate_infections.R index e089f887b..549cfeef0 100644 --- a/tests/testthat/test-estimate_infections.R +++ b/tests/testthat/test-estimate_infections.R @@ -65,7 +65,10 @@ test_that("estimate_infections successfully returns estimates when accumulating reported_cases_weekly[, confirm := frollsum(confirm, 7)] reported_cases_weekly <- reported_cases_weekly[seq(7, nrow(reported_cases_weekly), 7)] - test_estimate_infections(reported_cases_weekly, obs = obs_opts(na = "accumulate")) + reported_cases_weekly <- fill_missing( + reported_cases_weekly, missing_obs = "accumulate", initial_accumulate = 7 + ) + test_estimate_infections(reported_cases_weekly) }) test_that("estimate_infections successfully returns estimates using no delays", { diff --git a/tests/testthat/test-estimate_secondary.R b/tests/testthat/test-estimate_secondary.R index 9b46e7641..dfb9277a6 100644 --- a/tests/testthat/test-estimate_secondary.R +++ b/tests/testthat/test-estimate_secondary.R @@ -73,7 +73,7 @@ test_that("estimate_secondary can return values from simulated data and plot expect_equal( names(inc$predictions), c( - "date", "primary", "secondary", "mean", "se_mean", "sd", + "date", "primary", "secondary", "accumulate", "mean", "se_mean", "sd", "lower_90", "lower_50", "lower_20", "median", "upper_20", "upper_50", "upper_90" ) ) @@ -115,6 +115,9 @@ test_that("estimate_secondary successfully returns estimates when accumulating t cases_weekly <- merge( cases[, list(date, primary)], secondary_weekly, by = "date", all.x = TRUE ) + cases_weekly <- fill_missing( + cases_weekly, missing_obs = "accumulate", obs_column = "secondary" + ) inc_weekly <- estimate_secondary(cases_weekly, delays = delay_opts( LogNormal( @@ -122,7 +125,7 @@ test_that("estimate_secondary successfully returns estimates when accumulating t ) ), obs = obs_opts( - scale = list(mean = 0.4, sd = 0.05), week_effect = FALSE, na = "accumulate" + scale = list(mean = 0.4, sd = 0.05), week_effect = FALSE ), verbose = FALSE ) expect_true(is.list(inc_weekly$data)) diff --git a/tests/testthat/test-obs_opts.R b/tests/testthat/test-obs_opts.R index a9cb17db0..a90b3e6fe 100644 --- a/tests/testthat/test-obs_opts.R +++ b/tests/testthat/test-obs_opts.R @@ -18,14 +18,12 @@ test_that("obs_opts returns expected messages", { # NB: We change the local setting here to throw the message on demand, rather # than every 8 hours, for the sake of multiple runs of the test within # 8 hours. - rlang::local_options(rlib_message_verbosity = "verbose") - expect_message( - obs_opts(na = "accumulate"), - "modelled values that correspond to NA values" - ) + suppressMessages(expect_deprecated(obs_opts(na = "accumulate"))) }) test_that("obs_opts behaves as expected for user specified na treatment", { # If user explicitly specifies NA as missing, then don't throw message - expect_false(obs_opts(na = "missing")$na_as_missing_default_used) -}) \ No newline at end of file + expect_false( + suppressWarnings(obs_opts(na = "missing"))$na_as_missing_default_used + ) +}) diff --git a/tests/testthat/test-preprocessing.R b/tests/testthat/test-preprocessing.R new file mode 100644 index 000000000..1a9462d2b --- /dev/null +++ b/tests/testthat/test-preprocessing.R @@ -0,0 +1,35 @@ +cases <- data.table::copy(example_confirmed) +cases[, confirm := frollsum(confirm, 7)] +cases <- cases[seq(7, nrow(cases), 7)] +cases[2, confirm := NA] + +test_that("fill_missing works with NA and missing cases", { + filled <- fill_missing( + cases, missing_dates = "accumulate", initial_accumulate = 7 + ) + expect_equal(nrow(filled), nrow(cases) * 7) + expect_true(all(filled[!is.na(confirm), accumulate] == FALSE)) + expect_equal(filled[1:7, accumulate], c(rep(TRUE, 6), FALSE)) +}) + +test_that("fill_missing works with by columns", { + complete <- CJ( + date = cases$date, + country = c("A", "B"), + obs = c("Cases", "Deaths") + ) + more_cases <- merge(cases, complete, by = "date") + filled <- fill_missing( + more_cases, missing_obs = "accumulate", initial_accumulate = 7, + by = c("country", "obs") + ) + expect_equal(nrow(filled), nrow(more_cases) * 7) + expect_true(all(filled[!is.na(confirm), accumulate] == FALSE)) +}) + +test_that("fill_missing warns about initial data points", { + expect_warning( + fill_missing(cases, missing_dates = "accumulate"), + "Initial data point not marked as accumulated" + ) +})