diff --git a/DESCRIPTION b/DESCRIPTION index 4694024..e9c30d0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,6 +13,7 @@ Imports: cli, dplyr, lifecycle, + rlang, waldo Suggests: knitr, diff --git a/NAMESPACE b/NAMESPACE index 366c797..da68701 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,4 +4,7 @@ export(catch) export(create_object_list) export(loupe) export(release) +export(timeline) +export(timeline_group) importFrom(lifecycle,deprecated) +importFrom(rlang,.data) diff --git a/R/create_object_list.R b/R/create_object_list.R index 7d425bc..85de8c5 100644 --- a/R/create_object_list.R +++ b/R/create_object_list.R @@ -11,8 +11,8 @@ #' #' The main assumption is that `df_current` and `df_previous` are a newer and #' older versions of the same data, and that the `datetime_variable` variable -#' name always remains the same. Elsewhere new columns can of appear, -#' and these will be returned in the report. +#' name always remains the same. Elsewhere new columns can of appear, and these +#' will be returned in the report. #' #' @param df_current data.frame, the newest/current version of dataset x. #' @param df_previous data.frame, the old version of dataset, @@ -104,7 +104,7 @@ create_object_list <- function( deparse(substitute(df_current)), "' is your most recent data, and '", deparse(substitute(df_previous)), - "' is your previous data. If comparing directly use waldo::compare()." + "' is your previous data. If comparing directly, try waldo::compare()." 
) } else { # Tell the user which rows are new, regardless of previous data changing diff --git a/R/data.R b/R/data.R index c5ae851..7bb022d 100644 --- a/R/data.R +++ b/R/data.R @@ -6,9 +6,25 @@ #' A list with 4 dataframes (january, february, march, april) containing #' 3 columns, and 3 + n_month rows: #' \describe{ -#' \item{time}{The date on which the count took place, in yyyy-mm-dd format} +#' \item{time}{The date on which the imaginary count took place, +#' in yyyy-mm-dd format} #' \item{count}{Number of fictional butterflies counted} #' \item{species}{Butterfly species name, only appears in april} #' ... #' } "butterflycount" + +#' Forest precipitation dummy data +#' +#' A completely fictional dataset of daily precipitation +#' +#' @format ## `forestprecipitation` +#' A list with 2 dataframes (january, february) containing 2 columns, +#' and 6 rows. February intentionally resets to 1970-01-01 +#' \describe{ +#' \item{time}{The date on which the imaginary rainfall measurement took +#' place, in yyyy-mm-dd format} +#' \item{rainfall_mm}{Rainfall in mm} +#' ... +#' } +"forestprecipitation" diff --git a/R/timeline.R b/R/timeline.R new file mode 100644 index 0000000..f702cfe --- /dev/null +++ b/R/timeline.R @@ -0,0 +1,86 @@ +#' timeline: check if a timeseries is continuous +#' +#' Check if a timeseries is continuous. Even if a timeseries does not contain +#' obvious gaps, this does not automatically mean it is also continuous. +#' +#' Measuring instruments can have different behaviours when they fail. For +#' example, during power failure an internal clock could reset to "1970-01-01", +#' or the manufacturing date (say, "2021-01-01"). This leads to unpredictable +#' ways of checking if a dataset is continuous. +#' +#' The `timeline_group()` and `timeline()` functions attempt to give the user +#' control over how to check for continuity by providing an `expected_lag`. The +#' difference between timesteps in a dataset should not exceed the +#' `expected_lag`. 
+#' +#' @inheritParams timeline_group +#' +#' @seealso [timeline_group()] +#' +#' @returns A boolean, TRUE if the timeseries is continuous, and FALSE if there +#' are more than one continuous timeseries within the dataset. +#' +#' @examples +#' # A nice continuous dataset should return TRUE +#' butterfly::timeline( +#' forestprecipitation$january, +#' datetime_variable = "time", +#' expected_lag = 1 +#' ) +#' +#' # In February, our imaginary rain gauge's onboard computer had a failure. +#' # The timestamp was reset to 1970-01-01 +#' butterfly::timeline( +#' forestprecipitation$february, +#' datetime_variable = "time", +#' expected_lag = 1 +#' ) +#' +#' @export +timeline <- function( + df_current, + datetime_variable, + expected_lag = 1 +) { + + df_timelines <- timeline_group( + df_current, + datetime_variable, + expected_lag + ) + + if (length(unique(df_timelines$timeline_group)) == 1) { + is_continuous <- TRUE + + cli::cat_bullet( + "There are no time lags which are greater than the expected lag: ", + deparse(substitute(expected_lag)), + " ", + units(df_timelines$timelag), + ". By this measure, the timeseries is continuous.", + bullet = "tick", + col = "green", + bullet_col = "green" + ) + + } else if (length(unique(df_timelines$timeline_group)) > 1 ) { + is_continuous <- FALSE + + cli::cat_bullet( + "There are time lags which are greater than the expected lag: ", + deparse(substitute(expected_lag)), + " ", + units(df_timelines$timelag), + ". This indicates the timeseries is not continuous. There are ", + length(unique(df_timelines$timeline_group)), + " distinct continuous sequences. 
Use `timeline_group()` to extract.", + bullet = "info", + col = "orange", + bullet_col = "orange" + ) + } + + return(is_continuous) +} + + diff --git a/R/timeline_group.R b/R/timeline_group.R new file mode 100644 index 0000000..6b650da --- /dev/null +++ b/R/timeline_group.R @@ -0,0 +1,104 @@ +#' timeline_group: group distinct continuous sequences of a timeseries +#' +#' If after using `timeline()` you have established a timeseries is not +#' continuous, or if you are working with data where you expect distinct +#' sequences or events, you can use `timeline_group()` to extract and +#' classify different distinct continuous chunks of your data. +#' +#' We attempt to do this without sorting, or changing the data for a couple +#' of reasons: +#' +#' 1. There is no difference in dates: +#' Some instruments might record dates that appear identical, +#' but are still in chronological order. For example, high-frequency data +#' in fractional seconds. This is a rare use case though. +#' +#' 2. Dates are generally ascending/descending, but the instrument has +#' returned to origin. Probably more common, and will result in a +#' non-continuous dataset, however the records are still in chronological order. +#' This is something we would like to discover. This is accounted for in the +#' logic in case_when(). +#' +#' @param df_current data.frame, the newest/current version of dataset x. +#' @param datetime_variable string, the "datetime" variable that should be +#' checked for continuity. +#' @param expected_lag numeric, the acceptable difference between timesteps for +#' a timeseries to be classed as continuous. Any difference greater than +#' `expected_lag` will indicate a timeseries is not continuous. Default is 1. +#' The smallest units of measurement present in the column will be used. In a +#' column formatted YYYY-MM-DD day will be used. 
+#' +#' @returns A data.frame, identical to `df_current`, but with extra columns +#' `timeline_group`, which assigns a number to each continuous sets of +#' data and `timelag` which specifies the time lags between rows. +#' +#' @examples +#' # A nice continuous dataset should return TRUE +#' # In February, our imaginary rain gauge's onboard computer had a failure. +#' # The timestamp was reset to 1970-01-01 +#' +#' # We want to group these different distinct continuous sequences: +#' butterfly::timeline_group( +#' forestprecipitation$february, +#' datetime_variable = "time", +#' expected_lag = 1 +#' ) +#' +#' @importFrom rlang .data +#' +#' @export +timeline_group <- function( + df_current, + datetime_variable, + expected_lag = 1 +) { + stopifnot("`df_current` must be a data.frame" = is.data.frame(df_current)) + stopifnot("`expected_lag` must be numeric" = is.numeric(expected_lag)) + + # Check if `datetime_variable` is in `df_current` + if (!datetime_variable %in% names(df_current)) { + cli::cli_abort( + "`datetime_variable` must be present in `df_current`" + ) + } + + # Check if datetime_variable can be used by lag + if ( + inherits( + df_current[[datetime_variable]], + c("POSIXct", "POSIXlt", "POSIXt", "Date") + ) == FALSE + ) { + cli::cli_abort( + "`datetime_variable` must be class of POSIXct, POSIXlt, POSIXt, Date" + ) + } + + # Obtain distinct sequences of continuous measurement + df_timeline <- df_current |> + dplyr::mutate( + timelag = ( + .data[[datetime_variable]] - dplyr::lag( + .data[[datetime_variable]], + 1 + ) + ) + ) |> + dplyr::mutate( + timeline_group1 = dplyr::case_when( + # Include negative timelag, for example if a sensor cpu shuts down + # It can return to its original date (e.g. 
1970-01-01 or when it was + # deployed) + is.na(timelag) | timelag > expected_lag | timelag < -expected_lag ~ 1, + TRUE ~ 2 + ) + ) |> + dplyr::mutate( + timeline_group = cumsum(.data$timeline_group1 == 1) + ) |> + dplyr::select( + -"timeline_group1" + ) + + return(df_timeline) +} diff --git a/README.Rmd b/README.Rmd index 1d26a64..f5fc07c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -52,7 +52,10 @@ The butterfly package contains the following: * `butterfly::catch()` - returns rows which contain previously changed values in a dataframe. * `butterfly::release()` - drops rows which contain previously changed values, and returns a dataframe containing new and unchanged rows. * `butterfly::create_object_list()` - returns a list of objects required by all of `loupe()`, `catch()` and `release()`. Contains underlying functionality. + * `butterfly::timeline()` - check if a timeseries is continuous between timesteps. + * `butterfly::timeline_group()` - group distinct, but continuous sequences of a timeseries. * `butterflycount` - a list of monthly dataframes, which contain fictional butterfly counts for a given date. + * `forestprecipitation` - a list of monthly dataframes, which contain fictional daily precipitation measurements for a given date. ## Examples diff --git a/README.md b/README.md index 22c7936..ee53193 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,14 @@ The butterfly package contains the following: - `butterfly::create_object_list()` - returns a list of objects required by all of `loupe()`, `catch()` and `release()`. Contains underlying functionality. +- `butterfly::timeline()` - check if a timeseries is continuous between + timesteps. +- `butterfly::timeline_group()` - group distinct, but continuous + sequences of a timeseries. - `butterflycount` - a list of monthly dataframes, which contain fictional butterfly counts for a given date. 
+- `forestprecipitation` - a list of monthly dataframes, which contain + fictional daily precipitation measurements for a given date. ## Examples diff --git a/codemeta.json b/codemeta.json index d588c5a..eee6bed 100644 --- a/codemeta.json +++ b/codemeta.json @@ -13,7 +13,7 @@ "name": "R", "url": "https://r-project.org" }, - "runtimePlatform": "R version 4.4.1 (2024-06-14)", + "runtimePlatform": "R version 4.4.2 (2024-10-31)", "author": [ { "@type": "Person", @@ -109,6 +109,18 @@ "sameAs": "https://CRAN.R-project.org/package=lifecycle" }, "4": { + "@type": "SoftwareApplication", + "identifier": "rlang", + "name": "rlang", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=rlang" + }, + "5": { "@type": "SoftwareApplication", "identifier": "waldo", "name": "waldo", @@ -120,7 +132,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=waldo" }, - "5": { + "6": { "@type": "SoftwareApplication", "identifier": "R", "name": "R", @@ -128,7 +140,7 @@ }, "SystemRequirements": null }, - "fileSize": "408.569KB", + "fileSize": "416.405KB", "citation": [ { "@type": "CreativeWork", diff --git a/data/butterflycount.rda b/data/butterflycount.rda index a377d0e..58e07cb 100644 Binary files a/data/butterflycount.rda and b/data/butterflycount.rda differ diff --git a/data/forestprecipitation.rda b/data/forestprecipitation.rda new file mode 100644 index 0000000..4e69395 Binary files /dev/null and b/data/forestprecipitation.rda differ diff --git a/man/butterflycount.Rd b/man/butterflycount.Rd index 9990250..af88821 100644 --- a/man/butterflycount.Rd +++ b/man/butterflycount.Rd @@ -10,7 +10,8 @@ A list with 4 dataframes (january, february, march, april) containing 3 columns, and 3 + n_month rows: \describe{ -\item{time}{The date on which the count took place, in yyyy-mm-dd format} +\item{time}{The date on which the 
imaginary count took place, +in yyyy-mm-dd format} \item{count}{Number of fictional butterflies counted} \item{species}{Butterfly species name, only appears in april} ... diff --git a/man/create_object_list.Rd b/man/create_object_list.Rd index acefb26..60c03ce 100644 --- a/man/create_object_list.Rd +++ b/man/create_object_list.Rd @@ -36,8 +36,8 @@ returns a \code{waldo::compare()} call to give a detailed breakdown of changes. The main assumption is that \code{df_current} and \code{df_previous} are a newer and older versions of the same data, and that the \code{datetime_variable} variable -name always remains the same. Elsewhere new columns can of appear, -and these will be returned in the report. +name always remains the same. Elsewhere new columns can of appear, and these +will be returned in the report. } \examples{ butterfly_object_list <- butterfly::create_object_list( diff --git a/man/forestprecipitation.Rd b/man/forestprecipitation.Rd new file mode 100644 index 0000000..fa2029e --- /dev/null +++ b/man/forestprecipitation.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{forestprecipitation} +\alias{forestprecipitation} +\title{Forest precipitation dummy data} +\format{ +\subsection{\code{butterflycount}}{ + +A list with 2 dataframes (january, february) containing 2 columns, +and 6 rows. February intentionally resets to 1970-01-01 +\describe{ +\item{time}{The date on which the imaginary rainfall was measured took +place, in yyyy-mm-dd format} +\item{rainfall_mm}{Rainfall in mm} +... 
+} +} +} +\usage{ +forestprecipitation +} +\description{ +A completely fictional dataset of daily precipitation +} +\keyword{datasets} diff --git a/man/timeline.Rd b/man/timeline.Rd new file mode 100644 index 0000000..79ed1e5 --- /dev/null +++ b/man/timeline.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/timeline.R +\name{timeline} +\alias{timeline} +\title{timeline: check if a timeseries is continuous} +\usage{ +timeline(df_current, datetime_variable, expected_lag = 1) +} +\arguments{ +\item{df_current}{data.frame, the newest/current version of dataset x.} + +\item{datetime_variable}{string, the "datetime" variable that should be +checked for continuity.} + +\item{expected_lag}{numeric, the acceptable difference between timestep for +a timeseries to be classed as continuous. Any difference greater than +\code{expected_lag} will indicate a timeseries is not continuous. Default is 1. +The smallest units of measurement present in the column will be used. In a +column formatted YYYY-MM-DD day will be used.} +} +\value{ +A boolean, TRUE if the timeseries is continuous, and FALSE if there +are more than one continuous timeseries within the dataset. +} +\description{ +Check if a timeseries is continuous. Even if a timeseries does not contain +obvious gaps, this does not automatically mean it is also continuous. +} +\details{ +Measuring instruments can have different behaviours when they fail. For +example, during power failure an internal clock could reset to "1970-01-01", +or the manufacturing date (say, "2021-01-01"). This leads to unpredictable +ways of checking if a dataset is continuous. + +The \code{timeline_group()} and \code{timeline()} functions attempt to give the user +control over how to check for continuity by providing an \code{expected_lag}. The +difference between timesteps in a dataset should not exceed the +\code{expected_lag}. 
+} +\examples{ +# A nice continuous dataset should return TRUE +butterfly::timeline( + forestprecipitation$january, + datetime_variable = "time", + expected_lag = 1 +) + +# In February, our imaginary rain gauge's onboard computer had a failure. +# The timestamp was reset to 1970-01-01 +butterfly::timeline( + forestprecipitation$february, + datetime_variable = "time", + expected_lag = 1 +) + +} +\seealso{ +\code{\link[=timeline_group]{timeline_group()}} +} diff --git a/man/timeline_group.Rd b/man/timeline_group.Rd new file mode 100644 index 0000000..e41715d --- /dev/null +++ b/man/timeline_group.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/timeline_group.R +\name{timeline_group} +\alias{timeline_group} +\title{timeline_group: check if a timeseries is continuous} +\usage{ +timeline_group(df_current, datetime_variable, expected_lag = 1) +} +\arguments{ +\item{df_current}{data.frame, the newest/current version of dataset x.} + +\item{datetime_variable}{string, the "datetime" variable that should be +checked for continuity.} + +\item{expected_lag}{numeric, the acceptable difference between timestep for +a timeseries to be classed as continuous. Any difference greater than +\code{expected_lag} will indicate a timeseries is not continuous. Default is 1. +The smallest units of measurement present in the column will be used. In a +column formatted YYYY-MM-DD day will be used.} +} +\value{ +A data.frame, identical to \code{df_current}, but with extra columns +\code{timeline_group}, which assigns a number to each continuous sets of +data and \code{timelag} which specifies the time lags between rows. +} +\description{ +If after using \code{timeline()} you have established a timeseries is not +continuous, or if you are working with data where you expect distinct +sequences or events, you can use \code{timeline_group()} to extract and +classify different distinct continuous chunks of your data. 
+} +\details{ +We attempt to do this without sorting, or changing the data for a couple +of reasons: +\enumerate{ +\item There are no difference in dates: +Some instruments might record dates that appear identical, +but are still in chronological order. For example, high-frequency data +in fractional seconds. This is a rare use case though. +\item Dates are generally ascending/descending, but the instrument has +returned to origin. Probably more common, and will results in a +non-continuous dataset, however the records are still in chronological order +This is something we would like to discover. This is accounted for in the +logic in case_when(). +} +} +\examples{ +# A nice continuous dataset should return TRUE +# In February, our imaginary rain gauge's onboard computer had a failure. +# The timestamp was reset to 1970-01-01 + +# We want to group these different distinct continuous sequences: +butterfly::timeline_group( + forestprecipitation$february, + datetime_variable = "time", + expected_lag = 1 +) + +} diff --git a/tests/testthat/test-create_object_list.R b/tests/testthat/test-create_object_list.R index 2099b1d..ccfdb01 100644 --- a/tests/testthat/test-create_object_list.R +++ b/tests/testthat/test-create_object_list.R @@ -17,6 +17,17 @@ test_that("error when no new rows", { ) }) +test_that("error when no datetime_variable not present in both dfs", { + expect_error( + create_object_list( + butterflycount$january, + butterflycount$february, + datetime_variable = "foo" + ), + "`datetime_variable` must be present in `df_current` and `df_previous`" + ) +}) + test_that("correct message is fed back", { expect_output( create_object_list( diff --git a/tests/testthat/test-timeline.R b/tests/testthat/test-timeline.R new file mode 100644 index 0000000..e1a87fa --- /dev/null +++ b/tests/testthat/test-timeline.R @@ -0,0 +1,35 @@ +test_that("correct message is fed back", { + expect_output( + timeline( + forestprecipitation$january, + datetime_variable = "time", + 
expected_lag = 1 + ), + "There are no time lags which are greater than the expected lag" + ) + expect_output( + timeline( + forestprecipitation$february, + datetime_variable = "time", + expected_lag = 1 + ), + "There are time lags which are greater than the expected lag" + ) +}) + +test_that("correct message is fed back", { + expect_true( + timeline( + forestprecipitation$january, + datetime_variable = "time", + expected_lag = 1 + ) + ) + expect_false( + timeline( + forestprecipitation$february, + datetime_variable = "time", + expected_lag = 1 + ) + ) +}) diff --git a/tests/testthat/test-timeline_group.R b/tests/testthat/test-timeline_group.R new file mode 100644 index 0000000..75ff648 --- /dev/null +++ b/tests/testthat/test-timeline_group.R @@ -0,0 +1,83 @@ +test_that("returns dataframe", { + df_timelines <- butterfly::timeline_group( + forestprecipitation$january, + datetime_variable = "time", + expected_lag = 1 + ) + + expect_s3_class( + df_timelines, + "data.frame" + ) + + expect_named( + df_timelines, + c( + "time", + "rainfall_mm", + "timelag", + "timeline_group" + ) + ) +}) + +test_that("returns expected number of sequences", { + df_timelines <- butterfly::timeline_group( + forestprecipitation$january, + datetime_variable = "time", + expected_lag = 1 + ) + + expect_equal( + length( + unique( + df_timelines$timeline_group + ) + ), + 1 + ) + + df_reset <- butterfly::timeline_group( + forestprecipitation$february, + datetime_variable = "time", + expected_lag = 1 + ) + + expect_equal( + length( + unique( + df_reset$timeline_group + ) + ), + 2 + ) +}) + +test_that("expected errors work", { + expect_error( + df_timelines <- butterfly::timeline_group( + forestprecipitation$january, + datetime_variable = "foo", + expected_lag = 1 + ), + "`datetime_variable` must be present in `df_current`" + ) + + df_timelines <- butterfly::timeline_group( + forestprecipitation$january, + datetime_variable = "time", + expected_lag = 1 + ) + + df_timelines$time <- 
as.character(df_timelines$time) + + expect_error( + df_timelines <- butterfly::timeline_group( + df_timelines, + datetime_variable = "time", + expected_lag = 1 + ), + "`datetime_variable` must be class of POSIXct, POSIXlt, POSIXt, Date" + ) + +}) diff --git a/vignettes/articles/butterfly_paper.Rmd b/vignettes/articles/butterfly_paper.Rmd new file mode 100644 index 0000000..f7e91ae --- /dev/null +++ b/vignettes/articles/butterfly_paper.Rmd @@ -0,0 +1,108 @@ +--- +title: 'butterfly: An R package for the verification of continually updating timeseries data where we expect new values, but want to ensure previous data remains unchanged.' +tags: +- R +- quality assurance +- timeseries +- ERA5 +date: "23 October 2024" +affiliations: +- name: British Antarctic Survey, UK + index: 1 +authors: +- name: Thomas Zwagerman + orcid: "0009-0003-3742-3234" + equal-contrib: true + affiliation: 1 +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +#\> left out bibliography: paper.bib from yaml + +# Summary + +Previously recorded data could be revised after initial publication for a number of reasons, such as discovery of an inconsistency or error, a change in methodology or instrument re-calibration. When using other data to generate your own, it is crucial to reference the exact version of the data used, in order to maintain data provenance. Unnoticed changes in previous data could have unintended consequences, such as invalidating a published dataset’s Digital Object Identifier (DOI), or altering future predictions if used as input in forecasting models. + +But what if you are not aware of upstream changes to your input data? Monitoring data sources for these changes is not always possible. Here we present butterfly, an R package for the verification of continually updating timeseries data where we expect new values, but want to ensure previous data remains unchanged. 
+ +The intention of butterfly is to check for changes in previously published data, and warn the user with a report that contains as much detail as possible. This will allow them to stop unintended data transfer, revise their published data, release a new version and communicate the significance of the change to their users. + +# Statement of Need + +Importance of citing exact extract of data [(Klump et al. 2021)](https://datascience.codata.org/articles/10.5334/dsj-2021-012) + +Semantic versioning is widely adopted in research software [(Preston-Werner 2013)](https://semver.org/spec/v2.0.0.html) + +Generating a derived data product + +A key recommendation in Siddorn et al.'s (2022) report "An Information Management Framework for Environmental Digital Twins (IMFe)"... + +data provenance must be maintained + +data quality frameworks + +clearly documented for users and available in machine-readable format + +tools and methods + +... for a FAIR implementation (Wilkinson et al. 2016). + +At the British Antarctic Survey (BAS), we developed this package to deal with a very specific issue. + +Quality assurance in continually updating and continually published ERA5-derived data. + +At BAS, we frequently use ERA5 (Hersbach et al. 2023) as an input to climate models. + +IceNet, a sea ice prediction system based on deep learning (Andersson et al. 2021) + +ERA5-derived data. + +## The issue with ERA5 and ERA5T + +This package was originally developed to deal with [ERA5](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels?tab=documentation)'s initial release data, ERA5T. ERA5T data for a month is overwritten with the final ERA5 data two months after the month in question. 
+ +Usually ERA5 and ERA5T are identical, but occasionally an issue with input data can (for example for [09/21 - 12/21](https://confluence.ecmwf.int/display/CKB/ERA5T+issue+in+snow+depth), and [07/24](https://forum.ecmwf.int/t/final-validated-era5-product-to-differ-from-era5t-in-july-2024/6685)) force a recalculation, meaning previously published data differs from the final product. + +In most cases, this is not an issue. Static data publications which are a snapshot in time, such as data associated with a specific paper, as in "Forecasts, neural networks, and results from the paper: 'Seasonal Arctic sea ice forecasting with probabilistic deep learning'" [Andersson & Hosking (2021)](https://data.bas.ac.uk/full-record.php?id=GB/NERC/BAS/PDC/01526)[@Andersson_2021] or time period as in "Downscaled ERA5 monthly precipitation data using Multi-Fidelity Gaussian Processes between 1980 and 2012 for the Upper Beas and Sutlej Basins, Himalayas" [Tazi (2023)](https://data.bas.ac.uk/full-record.php?id=GB/NERC/BAS/PDC/01769), are unaffected. These datasets clearly describe a version and time period of ERA5 from which the data were derived, and will not be amended or updated in the future, even if ERA5 is recalculated. + +In our case, however, we want to continually append ERA5-derived datasets **and** continually publish them. This would be useful when functioning as a data source for an environmental digital twin (Blair & Henrys 2023), or simply as input data into an environmental forecasting model which itself is frequently running. + +Continually appending **and** publishing will require strict quality assurance. If a published dataset is only appended, a DOI can be minted for it. However, if the previously published data change, this will then invalidate the DOI. This could happen if, for example, you improved your code to find a better measure (more accurate, more precise) of the low pressure region, and wanted to reanalyse the previous data and republish. 
+ +One such ERA5-derived dataset which we will (hopefully soon!) publish at BAS is the Amundsen Sea Low Index (ASLI). + +## What is the Amundsen Sea Low Index? + +The Amundsen Sea Low (ASL) is a highly dynamic and mobile climatological low pressure system located in the Pacific sector of the Southern Ocean. In this sector, variability in sea-level pressure is greater than anywhere in the Southern Hemisphere, making it challenging to isolate local fluctuations in the ASL from larger-scale shifts in atmospheric pressure. The position and strength of the ASL are crucial for understanding regional change over West Antarctica (Hosking et al. 2016). + +# Citations + +# Acknowledgements + +# References + +Andersson, T., & Hosking, J. (2021). Forecasts, neural networks, and results from the paper: 'Seasonal Arctic sea ice forecasting with probabilistic deep learning' (Version 1.0) [Data set]. NERC EDS UK Polar Data Centre. + +Andersson, T.R., Hosking, J.S., Pérez-Ortiz, M. *et al.* Seasonal Arctic sea ice forecasting with probabilistic deep learning. *Nat Commun* **12**, 5124 (2021). + +Blair, Gordon S., and Peter A. Henrys. 2023. “The Role of Data Science in Environmental Digital Twins: In Praise of the Arrows.” Environmetrics 34 (January): Not available. . + +Hersbach, H., Bell, B., Berrisford, P., Biavati, G., Horányi, A., Muñoz Sabater, J., Nicolas, J., Peubey, C., Radu, R., Rozum, I., Schepers, D., Simmons, A., Soci, C., Dee, D., Thépaut, J-N. (2023): ERA5 hourly data on single levels from 1940 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.adbb2d47 + +Hosking, J. S., A. Orr, T. J. Bracegirdle, and J. Turner (2016), Future circulation changes off West Antarctica: Sensitivity of the Amundsen Sea Low to projected anthropogenic forcing, Geophys. Res. Lett., 43, 367–376, . + +Klump, J., Wyborn, L., Wu, M., Martin, J., Downs, R.R. and Asmi, A. 
(2021) ‘Versioning Data Is About More than Revisions: A Conceptual Framework and Proposed Principles’, Data Science Journal, 20(1), p. 12. Available at: . + +Preston-Werner, T. 2013. Semantic Versioning 2.0.0. Semantic Versioning. Available at [Last accessed 28 October 2024]. + +Siddorn, John, Gordon Shaw Blair, David Boot, Justin James Henry Buck, Andrew Kingdon, et al. 2022. “An Information Management Framework for Environmental Digital Twins (IMFe).” Zenodo. . + +Tazi, K. (2023). Downscaled ERA5 monthly precipitation data using Multi-Fidelity Gaussian Processes between 1980 and 2012 for the Upper Beas and Sutlej Basins, Himalayas (Version 1.0) [Data set]. NERC EDS UK Polar Data Centre. + +Wilkinson, Mark D., Michel Dumontier, IJsbrand Jan Aalbersberg, Gabrielle Appleton, Myles Axton, et al. 2016. “The FAIR Guiding Principles for Scientific Data Management and Stewardship.” Scientific Data 3 (1). . diff --git a/vignettes/butterfly.Rmd b/vignettes/butterfly.Rmd index eb434d1..5a4ce31 100644 --- a/vignettes/butterfly.Rmd +++ b/vignettes/butterfly.Rmd @@ -31,7 +31,7 @@ butterflycount This dataset is entirely fictional, and merely included to aid demonstrating butterfly's functionality. -## Examining datasets: loupe() +## Examining datasets: `loupe()` We can use `butterfly::loupe()` to examine in detail whether previous values have changed. @@ -70,7 +70,7 @@ butterfly::loupe( Call `?waldo::compare()` to see the full list of arguments. -## Extracting unexpected changes: catch() +## Extracting unexpected changes: `catch()` You might want to return changed rows as a dataframe. For this `butterfly::catch()`is provided. @@ -86,7 +86,7 @@ df_caught <- butterfly::catch( df_caught ``` -## Dropping unexpecrted changes: release() +## Dropping unexpected changes: `release()` Conversely, `butterfly::release()` drops all rows which had changed from the previous version. Note it retains new rows, as these were expected. 
@@ -114,6 +114,59 @@ df_release_without_new ``` +## Checking for continuity: `timeline()` +To check if a timeseries is continuous, `timeline()` and `timeline_group()` are +provided. Even if a timeseries does not contain obvious gaps, this does not +automatically mean it is also continuous. + +Measuring instruments can have different behaviours when they fail. For +example, during power failure an internal clock could reset to "1970-01-01", +or the manufacturing date (say, "2021-01-01"). This leads to unpredictable +ways of checking if a dataset is continuous. + +To check if a timeseries is continuous: + +```{r check_continuity} +butterfly::timeline( + forestprecipitation$january, + datetime_variable = "time", + expected_lag = 1 + ) +``` + +The above is a nice continuous dataset, where there is no more than a difference +of 1 day between timesteps. + +However, in February our imaginary rain gauge's onboard computer had a failure. + +The timestamp was reset to 1970-01-01: + +```{r not_continuous} +forestprecipitation$february + +butterfly::timeline( + forestprecipitation$february, + datetime_variable = "time", + expected_lag = 1 + ) +``` + +## Grouping distinct continuous sequences: `timeline_group()` + +If we wanted to group chunks of our timeseries that are distinct, or broken up +in some way, but still continuous, we can use `timeline_group()`: + +```{r timeline_group} +butterfly::timeline_group( + forestprecipitation$february, + datetime_variable = "time", + expected_lag = 1 + ) +``` + +We now have groups 1 & 2, which are both continuous sets of data, but there is +no continuity between them. + ## Using `butterfly` in a data processing pipeline If you would like to know more about using `butterfly` in an operational data processing pipeline, please refer to the article on [using `butterfly` in an operational pipeline](https://thomaszwagerman.github.io/butterfly/articles/butterfly_in_pipeline.html).