updated interface for accumulation (#851)

* add new fill_missing function * add new accumulate variable * update stan model * update checks * expand docs * create accumulate column when buffeting * use helper function for deprecation * render docs * update tests * add news item * linting * fix arg name * fix bugs * do all reports * update test of deprecated functionality * export function * fix example * add global var * add missing man page * fix issues identified in checks * fix recommended workflow * remove obsolete arguments * Apply suggestions from code review Co-authored-by: James Azam <[email protected]> Co-authored-by: Sam Abbott <[email protected]> * move accumulation to after truncation * lint * rename arguments * make "confirm" the default for `obs_column` * add "by" arugment to `fill_missing()` * clarify `initial_accumulate` argument * add assertions suggested by @jamesmbaazam * Apply suggestions from code review Co-authored-by: James Azam <[email protected]> * update example * only assert if not missing * set column order * update tests * render design doc from Rmd to check correctness * accumulate after truncation * add more documentation * clarify truncation happens before accumulation --------- Co-authored-by: James Azam <[email protected]> Co-authored-by: Sam Abbott <[email protected]>
epiforecasts · Nov 25, 2024 · 2f27a3f · 2f27a3f
1 parent 1abee6b
commit 2f27a3f
Show file tree

Hide file tree

Showing 37 changed files with 674 additions and 125 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -61,6 +61,7 @@ export(extract_CrIs)
 export(extract_inits)
 export(extract_samples)
 export(extract_stan_param)
+export(fill_missing)
 export(fix_dist)
 export(fix_parameters)
 export(forecast_infections)
@@ -137,6 +138,7 @@ importFrom(cli,col_blue)
 importFrom(cli,col_red)
 importFrom(data.table,":=")
 importFrom(data.table,.N)
+importFrom(data.table,CJ)
 importFrom(data.table,as.data.table)
 importFrom(data.table,copy)
 importFrom(data.table,data.table)

diff --git a/NEWS.md b/NEWS.md
@@ -1,13 +1,26 @@
 # EpiNow2 (development version)
 
-## Documentation
+## Model changes
 
-- Brought the docs on `alpha_sd` up to date with the code change from prior PR #853. By @zsusswein in #862 and reviewed by @jamesmbaazam.
+- The models now support more complex patterns of aggregating reported cases by accumulating them over multiple time points, as well as mixtures of accumulation and missingness via the new `fill_missing()` function and a logical `accumulate` column that can be supplied with the data. By @sbfnk in #851 and reviewed by @seabbs and @jamesmbaazam..
 
-## Model changes
+  ```r
+  # Deprecated
+  data |>
+    estimate_infections(obs_opts(na = "accumulate"))
 
-- A bug was fixed where the initial growth was never estimated (i.e. the prior mean was always zero). By @sbfnk in #853 and reviewed by @seabbs.
-- A bug was fixed where an internal function for applying a default cdf cutoff failed due to a difference a vector length issue. By @jamesmbaazam in #858 and reviewed by @sbfnk.
+  # Recommended workflow e.g. for weekly incidence data
+  data |>
+    fill_missing(missing = "accumulate", initial_accumulate = 7) |>
+    estimate_infections()
+  ```
+
+  - A bug was fixed where the initial growth was never estimated (i.e. the prior mean was always zero). By @sbfnk in #853 and reviewed by @seabbs.
+  - A bug was fixed where an internal function for applying a default cdf cutoff failed due to a difference a vector length issue. By @jamesmbaazam in #858 and reviewed by @sbfnk.
+
+## Documentation
+
+- Brought the docs on `alpha_sd` up to date with the code change from prior PR #853. By @zsusswein in #862 and reviewed by @jamesmbaazam.
 
 # EpiNow2 1.6.1
 

diff --git a/R/checks.R b/R/checks.R
@@ -15,29 +15,28 @@
 #' "estimate_infections", "estimate_truncation", or "estimate_secondary".
 #' This is used to determine which checks to perform on the data input.
 #' @importFrom checkmate assert_data_frame assert_date assert_names
-#' assert_numeric
+#'   assert_numeric
+#' @importFrom purrr walk
 #' @importFrom rlang arg_match
 #' @return Called for its side effects.
 #' @keywords internal
 check_reports_valid <- function(data,
                                 model = c(
                                   "estimate_infections",
-                                  "estimate_truncation",
                                   "estimate_secondary"
                                 )) {
   # Check that the case time series (reports) is a data frame
   assert_data_frame(data)
   # Perform checks depending on the model to the data is meant to be used with
   model <- arg_match(model)
 
+  assert_date(data$date, any.missing = FALSE)
   if (model == "estimate_secondary") {
     # Check that data has the right column names
     assert_names(
       names(data),
       must.include = c("date", "primary", "secondary")
     )
-    # Check that the data data.frame has the right column types
-    assert_date(data$date, any.missing = FALSE)
     assert_numeric(data$primary, lower = 0)
     assert_numeric(data$secondary, lower = 0)
   } else {
@@ -46,10 +45,10 @@ check_reports_valid <- function(data,
       names(data),
       must.include = c("date", "confirm")
     )
-    # Check that the data data.frame has the right column types
-    assert_date(data$date, any.missing = FALSE)
     assert_numeric(data$confirm, lower = 0)
   }
+  assert_logical(data$accumulate, null.ok = TRUE)
+  return(invisible(data))
 }
 
 #' Validate probability distribution for passing to stan
@@ -224,7 +223,7 @@ test_data_complete <- function(data, cols_to_check) {
 
 #' Cross-check treatment of `NA` in obs_opts() against input data
 #'
-#' @description `r lifecycle::badge("experimental")`
+#' @description `r lifecycle::badge("deprecated")`
 #'
 #' This function checks the input data for implicit and/or explicit missingness
 #' and checks if the user specified `na = "missing"` in [obs_opts()].

diff --git a/R/create.R b/R/create.R
@@ -77,6 +77,10 @@ create_clean_reported_cases <- function(data, horizon = 0,
   }
   reported_cases[is.na(confirm), confirm := fill]
   reported_cases[, "average_7_day" := NULL]
+  ## set accumulate to FALSE in added rows
+  if ("accumulate" %in% colnames(reported_cases)) {
+    reported_cases[is.na(accumulate), accumulate := FALSE]
+  }
   return(reported_cases)
 }
 
@@ -484,7 +488,6 @@ create_obs_model <- function(obs = obs_opts(), dates) {
     obs_scale = as.integer(obs$scale$sd > 0 || obs$scale$mean != 1),
     obs_scale_mean = obs$scale$mean,
     obs_scale_sd = obs$scale$sd,
-    accumulate = obs$accumulate,
     likelihood = as.numeric(obs$likelihood),
     return_likelihood = as.numeric(obs$return_likelihood)
   )
@@ -525,14 +528,16 @@ create_obs_model <- function(obs = obs_opts(), dates) {
 create_stan_data <- function(data, seeding_time,
                              rt, gp, obs, horizon,
                              backcalc, shifted_cases) {
-
   cases <- data[(seeding_time + 1):(.N - horizon)]
   complete_cases <- create_complete_cases(cases)
   cases <- cases$confirm
+  accumulate <- data[-(1:seeding_time)]$accumulate
 
   stan_data <- list(
     cases = complete_cases$confirm,
     cases_time = complete_cases$lookup,
+    any_accumulate = as.integer(any(accumulate)),
+    accumulate = as.integer(accumulate),
     lt = nrow(complete_cases),
     shifted_cases = shifted_cases,
     t = length(data$date),

diff --git a/R/estimate_infections.R b/R/estimate_infections.R
@@ -17,8 +17,14 @@
 #' for an example of using `estimate_infections` within the `epinow` wrapper to
 #' estimate Rt for Covid-19 in a country from the ECDC data source.
 #'
-#' @param data A `<data.frame>` of confirmed cases (confirm) by date
-#' (date). `confirm` must be numeric and `date` must be in date format.
+#' @param data A `<data.frame>` of confirmed cases (confirm) by date (date).
+#' `confirm` must be numeric and `date` must be in date format. Optionally
+#' this can also have a logical `accumulate` column which indicates whether
+#' data should be added to the next data point. This is useful when modelling
+#' e.g. weekly incidence data. See also the [fill_missing()] function which
+#' helps add the `accumulate` column with the desired properties when dealing
+#' with non-daily data. If any accumulation is done this happens after
+#' truncation as specified by the `truncation` argument.
 #'
 #' @param reported_cases Deprecated; use `data` instead.
 #'
@@ -176,10 +182,11 @@ estimate_infections <- function(data,
     data = dirty_reported_cases,
     cols_to_check = c("date", "confirm")
   )
+  # Fill missing dates
+  reported_cases <- default_fill_missing_obs(data, obs, "confirm")
   # Create clean and complete cases
-  # Order cases
   reported_cases <- create_clean_reported_cases(
-    data, horizon,
+    reported_cases, horizon,
     filter_leading_zeros = filter_leading_zeros,
     zero_threshold = zero_threshold
   )
@@ -197,9 +204,9 @@ estimate_infections <- function(data,
         min(reported_cases$date) - 1,
         by = "days"
       ),
-      confirm = 0, breakpoint = 0
+      confirm = 0, accumulate = FALSE, breakpoint = 0
     ),
-    reported_cases
+    reported_cases[, .(date, confirm, accumulate, breakpoint)]
   ))
 
   shifted_cases <- create_shifted_cases(

diff --git a/R/estimate_secondary.R b/R/estimate_secondary.R
@@ -26,7 +26,13 @@
 #' respectively on the log scale).
 #'
 #' @param data A `<data.frame>` containing the `date` of report and both
-#' `primary` and `secondary` reports.
+#' `primary` and `secondary` reports. Optionally this can also have a logical
+#' `accumulate` column which indicates whether data should be added to the
+#' next data point. This is useful when modelling e.g. weekly incidence data.
+#' See also the [fill_missing()] function which helps add the `accumulate`
+#' column with the desired properties when dealing with non-daily data. If any
+#' accumulation is done this happens after truncation as specified by the
+#' `truncation` argument.
 #'
 #' @param reports Deprecated; use `data` instead.
 #'
@@ -190,7 +196,10 @@ estimate_secondary <- function(data,
     data = reports,
     cols_to_check = c("date", "primary", "secondary")
   )
-  secondary_reports_dirty <- reports[, list(date, confirm = secondary)]
+  reports <- default_fill_missing_obs(reports, obs, "secondary")
+
+  secondary_reports_dirty <-
+    reports[, list(date, confirm = secondary, accumulate)]
   secondary_reports <- create_clean_reported_cases(
     secondary_reports_dirty,
     filter_leading_zeros = filter_leading_zeros,
@@ -225,7 +234,9 @@ estimate_secondary <- function(data,
     obs_time = complete_secondary[lookup > burn_in]$lookup - burn_in,
     lt = sum(complete_secondary$lookup > burn_in),
     burn_in = burn_in,
-    seeding_time = 0
+    seeding_time = 0,
+    any_accumulate = as.integer(any(reports$accumulate > 0)),
+    accumulate = as.integer(reports$accumulate)
   )
   # secondary model options
   stan_data <- c(stan_data, secondary)

diff --git a/R/estimate_truncation.R b/R/estimate_truncation.R
@@ -145,7 +145,7 @@ estimate_truncation <- function(data,
     )
   }
    # Validate inputs
-  walk(data, check_reports_valid, model = "estimate_truncation")
+  walk(data, check_reports_valid, model = "estimate_infections")
   assert_class(truncation, "dist_spec")
   assert_class(model, "stanfit", null.ok = TRUE)
   assert_numeric(CrIs, lower = 0, upper = 1)

diff --git a/R/opts.R b/R/opts.R
@@ -594,15 +594,7 @@ gp_opts <- function(basis_prop = 0.2,
 #'   scale) or a list with numeric elements mean (`mean`) and standard deviation
 #'   (`sd`) defining a normally distributed scaling factor. Defaults to 1, i.e.
 #'   no scaling.
-#' @param na Character. Options are "missing" (the default) and "accumulate".
-#'   This determines how NA values in the data are interpreted. If set to
-#'   "missing", any NA values in the observation data set will be interpreted as
-#'   missing and skipped in the likelihood. If set to "accumulate", modelled
-#'   observations will be accumulated and added to the next non-NA data point.
-#'   This can be used to model incidence data that is reported at less than
-#'   daily intervals. If set to "accumulate", the first data point is not
-#'   included in the likelihood but used only to reset modelled observations to
-#'   zero.
+#' @param na Deprecated; use the [fill_missing()] function instead
 #' @param likelihood Logical, defaults to `TRUE`. Should the likelihood be
 #'   included in the model.
 #' @param return_likelihood Logical, defaults to `FALSE`. Should the likelihood
@@ -631,6 +623,21 @@ obs_opts <- function(family = c("negbin", "poisson"),
                      return_likelihood = FALSE) {
   # NB: This has to be checked first before the na argument is touched anywhere.
   na_default_used <- missing(na)
+  if (!na_default_used) {
+    lifecycle::deprecate_warn(
+      "1.7.0",
+      "obs_opts(na)",
+      "fill_missing()",
+      details =  c(
+        paste0(
+          "If NA values are not to be treated as missing use the ",
+          "`fill_missing()` function instead."
+        ),
+        "This argument will be removed in the next release of EpiNow2."
+      )
+    )
+
+  }
   na <- arg_match(na)
   if (na == "accumulate") {
     #nolint start: duplicate_argument_linter
@@ -641,8 +648,8 @@ obs_opts <- function(family = c("negbin", "poisson"),
         "i" = "This means that the first data point is not included in the
       likelihood but used only to reset modelled observations to zero.",
         "i" = "{col_red('If the first data point should be included in the
-        likelihood this can be achieved by adding a data point of arbitrary
-        value before the first data point.')}"
+        likelihood this can be achieved by using the `fill_missing()` function
+        with a non-zero `initial_missing` argument.')}"
       ),
       .frequency = "regularly",
       .frequency_id = "obs_opts"