Skip to content

Commit

Permalink
Merge pull request #25 from thomaszwagerman/timelines
Browse files Browse the repository at this point in the history
Adding in continuity checking functions
  • Loading branch information
thomaszwagerman authored Nov 6, 2024
2 parents b28f2c3 + 6129790 commit 0c2e6ee
Show file tree
Hide file tree
Showing 21 changed files with 679 additions and 13 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Imports:
cli,
dplyr,
lifecycle,
rlang,
waldo
Suggests:
knitr,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ export(catch)
export(create_object_list)
export(loupe)
export(release)
export(timeline)
export(timeline_group)
importFrom(lifecycle,deprecated)
importFrom(rlang,.data)
6 changes: 3 additions & 3 deletions R/create_object_list.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
#'
#' The main assumption is that `df_current` and `df_previous` are a newer and
#' older versions of the same data, and that the `datetime_variable` variable
#' name always remains the same. Elsewhere new columns can of appear,
#' and these will be returned in the report.
#' name always remains the same. Elsewhere new columns can of appear, and these
#' will be returned in the report.
#'
#' @param df_current data.frame, the newest/current version of dataset x.
#' @param df_previous data.frame, the old version of dataset,
Expand Down Expand Up @@ -104,7 +104,7 @@ create_object_list <- function(
deparse(substitute(df_current)),
"' is your most recent data, and '",
deparse(substitute(df_previous)),
"' is your previous data. If comparing directly use waldo::compare()."
"' is your previous data. If comparing directly, try waldo::compare()."
)
} else {
# Tell the user which rows are new, regardless of previous data changing
Expand Down
18 changes: 17 additions & 1 deletion R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,25 @@
#' A list with 4 dataframes (january, february, march, april) containing
#' 3 columns, and 3 + n_month rows:
#' \describe{
#' \item{time}{The date on which the count took place, in yyyy-mm-dd format}
#' \item{time}{The date on which the imaginary count took place,
#' in yyyy-mm-dd format}
#' \item{count}{Number of fictional butterflies counted}
#' \item{species}{Butterfly species name, only appears in april}
#' ...
#' }
"butterflycount"

#' Forest precipitation dummy data
#'
#' A completely fictional dataset of daily precipitation
#'
#' @format ## `butterflycount`
#' A list with 2 dataframes (january, february) containing 2 columns,
#' and 6 rows. February intentionally resets to 1970-01-01
#' \describe{
#' \item{time}{The date on which the imaginary rainfall was measured took
#' place, in yyyy-mm-dd format}
#' \item{rainfall_mm}{Rainfall in mm}
#' ...
#' }
"forestprecipitation"
86 changes: 86 additions & 0 deletions R/timeline.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#' timeline: check if a timeseries is continuous
#'
#' Check if a timeseries is continuous. Even if a timeseries does not contain
#' obvious gaps, this does not automatically mean it is also continuous.
#'
#' Measuring instruments can have different behaviours when they fail. For
#' example, during power failure an internal clock could reset to "1970-01-01",
#' or the manufacturing date (say, "2021-01-01"). This leads to unpredictable
#' ways of checking if a dataset is continuous.
#'
#' The `timeline_group()` and `timeline()` functions attempt to give the user
#' control over how to check for continuity by providing an `expected_lag`. The
#' difference between timesteps in a dataset should not exceed the
#' `expected_lag`.
#'
#' @inheritParams timeline_group
#'
#' @seealso [timeline_group()]
#'
#' @returns A boolean, TRUE if the timeseries is continuous, and FALSE if there
#' are more than one continuous timeseries within the dataset.
#'
#' @examples
#' # A nice continuous dataset should return TRUE
#' butterfly::timeline(
#' forestprecipitation$january,
#' datetime_variable = "time",
#' expected_lag = 1
#' )
#'
#' # In February, our imaginary rain gauge's onboard computer had a failure.
#' # The timestamp was reset to 1970-01-01
#' butterfly::timeline(
#' forestprecipitation$february,
#' datetime_variable = "time",
#' expected_lag = 1
#' )
#'
#' @export
timeline <- function(
df_current,
datetime_variable,
expected_lag = 1
) {

df_timelines <- timeline_group(
df_current,
datetime_variable,
expected_lag
)

if (length(unique(df_timelines$timeline_group)) == 1) {
is_continuous <- TRUE

cli::cat_bullet(
"There are no time lags which are greater than the expected lag: ",
deparse(substitute(expected_lag)),
" ",
units(df_timelines$timelag),
". By this measure, the timeseries is continuous.",
bullet = "tick",
col = "green",
bullet_col = "green"
)

} else if (length(unique(df_timelines$timeline_group)) > 1 ) {
is_continuous <- FALSE

cli::cat_bullet(
"There are time lags which are greater than the expected lag: ",
deparse(substitute(expected_lag)),
" ",
units(df_timelines$timelag),
". This indicates the timeseries is not continuous. There are ",
length(unique(df_timelines$timeline_group)),
" distinct continuous sequences. Use `timeline_group()` to extract.",
bullet = "info",
col = "orange",
bullet_col = "orange"
)
}

return(is_continuous)
}


104 changes: 104 additions & 0 deletions R/timeline_group.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#' timeline_group: check if a timeseries is continuous
#'
#' If after using `timeline()` you have established a timeseries is not
#' continuous, or if you are working with data where you expect distinct
#' sequences or events, you can use `timeline_group()` to extract and
#' classify different distinct continuous chunks of your data.
#'
#' We attempt to do this without sorting, or changing the data for a couple
#' of reasons:
#'
#' 1. There are no difference in dates:
#' Some instruments might record dates that appear identical,
#' but are still in chronological order. For example, high-frequency data
#' in fractional seconds. This is a rare use case though.
#'
#' 2. Dates are generally ascending/descending, but the instrument has
#' returned to origin. Probably more common, and will results in a
#' non-continuous dataset, however the records are still in chronological order
#' This is something we would like to discover. This is accounted for in the
#' logic in case_when().
#'
#' @param df_current data.frame, the newest/current version of dataset x.
#' @param datetime_variable string, the "datetime" variable that should be
#' checked for continuity.
#' @param expected_lag numeric, the acceptable difference between timestep for
#' a timeseries to be classed as continuous. Any difference greater than
#' `expected_lag` will indicate a timeseries is not continuous. Default is 1.
#' The smallest units of measurement present in the column will be used. In a
#' column formatted YYYY-MM-DD day will be used.
#'
#' @returns A data.frame, identical to `df_current`, but with extra columns
#' `timeline_group`, which assigns a number to each continuous sets of
#' data and `timelag` which specifies the time lags between rows.
#'
#' @examples
#' # A nice continuous dataset should return TRUE
#' # In February, our imaginary rain gauge's onboard computer had a failure.
#' # The timestamp was reset to 1970-01-01
#'
#' # We want to group these different distinct continuous sequences:
#' butterfly::timeline_group(
#' forestprecipitation$february,
#' datetime_variable = "time",
#' expected_lag = 1
#' )
#'
#' @importFrom rlang .data
#'
#' @export
timeline_group <- function(
df_current,
datetime_variable,
expected_lag = 1
) {
stopifnot("`df_current` must be a data.frame" = is.data.frame(df_current))
stopifnot("`expected_lag` must be numeric" = is.numeric(expected_lag))

# Check if `datetime_variable` is in `df_current`
if (!datetime_variable %in% names(df_current)) {
cli::cli_abort(
"`datetime_variable` must be present in `df_current`"
)
}

# Check if datetime_variable can be used by lag
if (
inherits(
df_current[[datetime_variable]],
c("POSIXct", "POSIXlt", "POSIXt", "Date")
) == FALSE
) {
cli::cli_abort(
"`datetime_variable` must be class of POSIXct, POSIXlt, POSIXt, Date"
)
}

# Obtain distinct sequences of continuous measurement
df_timeline <- df_current |>
dplyr::mutate(
timelag = (
.data[[datetime_variable]] - dplyr::lag(
.data[[datetime_variable]],
1
)
)
) |>
dplyr::mutate(
timeline_group1 = dplyr::case_when(
# Include negative timelag, for example if a sensor cpu shuts down
# It can return to its original date (e.g. 1970-01-01 or when it was
# deployed)
is.na(timelag) | timelag > expected_lag | timelag < -expected_lag ~ 1,
TRUE ~ 2
)
) |>
dplyr::mutate(
timeline_group = cumsum(.data$timeline_group1 == 1)
) |>
dplyr::select(
-"timeline_group1"
)

return(df_timeline)
}
3 changes: 3 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,10 @@ The butterfly package contains the following:
* `butterfly::catch()` - returns rows which contain previously changed values in a dataframe.
* `butterfly::release()` - drops rows which contain previously changed values, and returns a dataframe containing new and unchanged rows.
* `butterfly::create_object_list()` - returns a list of objects required by all of `loupe()`, `catch()` and `release()`. Contains underlying functionality.
* `butterfly::timeline()` - check if a timeseries is continuous between timesteps.
* `butterfly::timeline_group()` - group distinct, but continuous sequences of a timeseres.
* `butterflycount` - a list of monthly dataframes, which contain fictional butterfly counts for a given date.
* `forestprecipitation` - a list of monthly dataframes, which contain fictional daily precipitation measurements for a given date.

## Examples

Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,14 @@ The butterfly package contains the following:
- `butterfly::create_object_list()` - returns a list of objects required
by all of `loupe()`, `catch()` and `release()`. Contains underlying
functionality.
- `butterfly::timeline()` - check if a timeseries is continuous between
timesteps.
- `butterfly::timeline_group()` - group distinct, but continuous
sequences of a timeseres.
- `butterflycount` - a list of monthly dataframes, which contain
fictional butterfly counts for a given date.
- `forestprecipitation` - a list of monthly dataframes, which contain
fictional daily precipitation measurements for a given date.

## Examples

Expand Down
18 changes: 15 additions & 3 deletions codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"name": "R",
"url": "https://r-project.org"
},
"runtimePlatform": "R version 4.4.1 (2024-06-14)",
"runtimePlatform": "R version 4.4.2 (2024-10-31)",
"author": [
{
"@type": "Person",
Expand Down Expand Up @@ -109,6 +109,18 @@
"sameAs": "https://CRAN.R-project.org/package=lifecycle"
},
"4": {
"@type": "SoftwareApplication",
"identifier": "rlang",
"name": "rlang",
"provider": {
"@id": "https://cran.r-project.org",
"@type": "Organization",
"name": "Comprehensive R Archive Network (CRAN)",
"url": "https://cran.r-project.org"
},
"sameAs": "https://CRAN.R-project.org/package=rlang"
},
"5": {
"@type": "SoftwareApplication",
"identifier": "waldo",
"name": "waldo",
Expand All @@ -120,15 +132,15 @@
},
"sameAs": "https://CRAN.R-project.org/package=waldo"
},
"5": {
"6": {
"@type": "SoftwareApplication",
"identifier": "R",
"name": "R",
"version": ">= 2.10"
},
"SystemRequirements": null
},
"fileSize": "408.569KB",
"fileSize": "416.405KB",
"citation": [
{
"@type": "CreativeWork",
Expand Down
Binary file modified data/butterflycount.rda
Binary file not shown.
Binary file added data/forestprecipitation.rda
Binary file not shown.
3 changes: 2 additions & 1 deletion man/butterflycount.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/create_object_list.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/forestprecipitation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 0c2e6ee

Please sign in to comment.