Skip to content

Commit

Permalink
moved common functionality to create_object_list, updated documentati…
Browse files Browse the repository at this point in the history
…on, testing etc
  • Loading branch information
thomaszwagerman committed Oct 17, 2024
1 parent a0334fa commit 7185708
Show file tree
Hide file tree
Showing 17 changed files with 376 additions and 353 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(catch)
export(create_object_list)
export(loupe)
export(release)
importFrom(lifecycle,deprecated)
98 changes: 19 additions & 79 deletions R/catch.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#' which contains only rows that have changed compared to previous data. It will
#' not return any new rows.
#'
#' The underlying functionality is handled by `create_object_list()`.
#'
#' @param df_current data.frame, the newest/current version of dataset x.
#' @param df_previous data.frame, the old version of dataset, for example x - t1.
#' @param datetime_variable character, which variable to use as unique ID to join `df_current` and `df_previous`. Usually a "datetime" variable.
Expand All @@ -13,6 +15,7 @@
#' also returns a waldo object as in `loupe()`.
#'
#' @seealso [loupe()]
#' @seealso [create_object_list()]
#'
#' @examples
#' df_caught <- butterfly::catch(
Expand All @@ -25,92 +28,29 @@
#'
#' @export
catch <- function(df_current, df_previous, datetime_variable) {
# Check input is as expected
stopifnot("`df_current` must be a data.frame" = is.data.frame(df_current))
stopifnot("`df_previous` must be a data.frame" = is.data.frame(df_previous))

# Check if `datetime_variable` is in both `df_current` and `df_previous`
if (!datetime_variable %in% names(df_current) || !datetime_variable %in% names(df_previous)) {
stop(
"`datetime_variable` must be present in both `df_current` and `df_previous`"
)
}

# Using semi_join to extract rows with matching datetime_variables
# (ie previously generated data)
df_current_without_new_row <- dplyr::semi_join(
butterfly_object_list <- create_object_list(
df_current,
df_previous,
by = datetime_variable
datetime_variable
)

# Compare the current data with the previous data, without "new" values
waldo_object <- waldo::compare(
df_current_without_new_row,
df_previous
)

# Obtaining the new rows to provide in feedback
df_current_new_rows <- dplyr::anti_join(
df_current,
df_previous,
by = datetime_variable
)

if (nrow(df_current_new_rows) == 0) {
warning(
"There are no new rows. Check '",
deparse(substitute(df_current)),
"' is your most recent data, and '",
deparse(substitute(df_previous)),
"' is your previous data."
)
} else {
# Tell the user which rows are new, regardless of previous data changing
cli::cat_line(
paste0(
"The following rows are new in '",
deparse(substitute(df_current)),
"': "
),
col = "green"
)

cli::cat_print(
df_current_new_rows
# By using an anti join on all shared columns, we keep only the rows of the
# current data which have no exact match in df_previous (i.e. changed rows).
df_rows_changed_from_previous <- suppressMessages(
dplyr::anti_join(
butterfly_object_list$df_current_without_new_row,
df_previous
)
}

# Return a simple message if there are no changes in previous data
if (length(waldo_object) == 0) {
stop(
"There are no differences between current and previous data."
)
} else {
# Return detailed breakdown and warning if previous data have changed.
if (length(waldo_object) > 0) {
cli::cat_line()
)

cli::cat_bullet(
"The following rows have changed from the previous data, and will be returned:",
bullet = "info",
col = "orange",
bullet_col = "orange"
)
cli::cat_line()

cli::cat_print(
waldo_object
)
cli::cat_bullet(
"Only these rows are returned.",
bullet = "info",
col = "orange",
bullet_col = "orange"
)

# By using an inner join, we drop any row which does not match in
# df_previous.
df_rows_changed_from_previous <- suppressMessages(
dplyr::anti_join(
df_current_without_new_row,
df_previous
)
)
}
}
return(df_rows_changed_from_previous)
}
139 changes: 139 additions & 0 deletions R/create_object_list.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#' create_object_list: creates a list of objects used in all butterfly functions
#'
#' This function creates a list of objects which is used by all of `loupe()`,
#' `catch()` and `release()`.
#'
#' This function matches two dataframe objects by their unique identifier
#' (usually "time" or "datetime" in a timeseries).
#'
#' It informs the user of new (unmatched) rows which have appeared, and then
#' prints a `waldo::compare()` report to give a detailed breakdown of changes.
#'
#' The main assumption is that `df_current` and `df_previous` are newer and
#' older versions of the same data, and that the `datetime_variable` variable
#' name always remains the same. Elsewhere new columns can appear, and these
#' will be returned in the report.
#'
#' An error is raised if `df_current` contains no rows that are new compared
#' to `df_previous`.
#'
#' @param df_current data.frame, the newest/current version of dataset x.
#' @param df_previous data.frame, the old version of dataset, for example x - t1.
#' @param datetime_variable character, which variable to use as unique ID to
#' join `df_current` and `df_previous`. Usually a "datetime" variable. A
#' character vector of several column names is treated as a composite ID.
#'
#' @returns A list with three elements: `butterfly_status`, a boolean where
#' TRUE indicates no changes to previous data and FALSE indicates unexpected
#' changes; `df_current_without_new_row`, a dataframe of the current data
#' without new rows; and `df_current_new_rows`, a dataframe of new rows only.
#'
#' @examples
#' butterfly_object_list <- butterfly::create_object_list(
#'   butterflycount$february,
#'   butterflycount$january,
#'   datetime_variable = "time"
#' )
#'
#' butterfly_object_list
#'
#' @export
create_object_list <- function(df_current, df_previous, datetime_variable) {
  # Check input is as expected
  stopifnot("`df_current` must be a data.frame" = is.data.frame(df_current))
  stopifnot("`df_previous` must be a data.frame" = is.data.frame(df_previous))
  stopifnot(
    "`datetime_variable` must be a character vector of column names" =
      is.character(datetime_variable) &&
        length(datetime_variable) > 0 &&
        !anyNA(datetime_variable)
  )

  # Check if `datetime_variable` is in both `df_current` and `df_previous`.
  # `all()` keeps each side of `||` scalar, so a vector of key columns gives
  # the intended error instead of a length-coercion failure.
  if (
    !all(datetime_variable %in% names(df_current)) ||
      !all(datetime_variable %in% names(df_previous))
  ) {
    stop(
      "`datetime_variable` must be present in both `df_current` and `df_previous`"
    )
  }

  # Labels used in user feedback. `deparse()` may split a long expression over
  # several strings, so collapse them to keep each message on a single line.
  # NOTE: when this function is reached through a wrapper that forwards its
  # own arguments, these labels are the wrapper's argument names.
  label_current <- paste(deparse(substitute(df_current)), collapse = "")
  label_previous <- paste(deparse(substitute(df_previous)), collapse = "")

  # Using semi_join to extract rows with matching datetime_variables
  # (ie previously generated data)
  df_current_without_new_row <- dplyr::semi_join(
    df_current,
    df_previous,
    by = datetime_variable
  )

  # Obtaining the new rows to provide in feedback
  df_current_new_rows <- dplyr::anti_join(
    df_current,
    df_previous,
    by = datetime_variable
  )

  # Without new rows there is nothing to report on, so fail before doing the
  # (potentially expensive) comparison.
  if (nrow(df_current_new_rows) == 0) {
    stop(
      "There are no new rows. Check '",
      label_current,
      "' is your most recent data, and '",
      label_previous,
      "' is your previous data. If comparing like for like, try waldo::compare()."
    )
  }

  # Tell the user which rows are new, regardless of previous data changing
  cli::cat_line(
    "The following rows are new in '",
    label_current,
    "': ",
    col = "green"
  )
  cli::cat_print(
    df_current_new_rows
  )

  # Compare the current data with the previous data, without "new" values
  waldo_object <- waldo::compare(
    df_current_without_new_row,
    df_previous
  )

  # An empty waldo object means previous data are unchanged
  butterfly_status <- length(waldo_object) == 0

  if (butterfly_status) {
    # Return a simple message if there are no changes in previous data
    cli::cat_bullet(
      "And there are no differences with previous data.",
      bullet = "tick",
      col = "green",
      bullet_col = "green"
    )
  } else {
    # Return detailed breakdown and warning if previous data have changed.
    cli::cat_line()

    cli::cat_bullet(
      "The following values have changes from the previous data.",
      bullet = "info",
      col = "orange",
      bullet_col = "orange"
    )

    cli::cat_print(
      waldo_object
    )
  }

  # Objects used by `loupe()`, `catch()` and `release()`
  list(
    butterfly_status = butterfly_status,
    df_current_without_new_row = df_current_without_new_row,
    df_current_new_rows = df_current_new_rows
  )
}
81 changes: 8 additions & 73 deletions R/loupe.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@
#' remains the same. Elsewhere new columns can appear, and these will be
#' returned in the report.
#'
#' The underlying functionality is handled by `create_object_list()`.
#'
#' @param df_current data.frame, the newest/current version of dataset x.
#' @param df_previous data.frame, the old version of dataset, for example x - t1.
#' @param datetime_variable string, which variable to use as unique ID to join `df_current` and `df_previous`. Usually a "datetime" variable.
#'
#' @returns A waldo object containing a message on differences or 'And there are no differences with previous data'.
#' @returns A boolean where TRUE indicates no changes to previous data and FALSE indicates unexpected changes.
#'
#' @seealso [create_object_list()]
#'
#' @examples
#' # This example contains no differences with previous data
Expand All @@ -41,81 +45,12 @@
#'
#' @export
loupe <- function(df_current, df_previous, datetime_variable) {
# Check input is as expected
stopifnot("`df_current` must be a data.frame" = is.data.frame(df_current))
stopifnot("`df_previous` must be a data.frame" = is.data.frame(df_previous))

# Check if `datetime_variable` is in both `df_current` and `df_previous`
if (!datetime_variable %in% names(df_current) || !datetime_variable %in% names(df_previous)) {
stop(
"`datetime_variable` must be present in both `df_current` and `df_previous`"
)
}

# Using semi_join to extract rows with matching datetime_variables
# (ie previously generated data)
df_current_without_new_row <- dplyr::semi_join(
butterfly_object_list <- create_object_list(
df_current,
df_previous,
by = datetime_variable
)

# Compare the current data with the previous data, without "new" values
waldo_object <- waldo::compare(
df_current_without_new_row,
df_previous
datetime_variable
)

# Obtaining the new rows to provide in feedback
df_current_new_rows <- dplyr::anti_join(
df_current,
df_previous,
by = datetime_variable
)

# Creating a feedback message depending on the waldo object's output
# First checking if there are new rows at all:
if (nrow(df_current_new_rows) == 0) {
stop(
"There are no new rows. Check '",
deparse(substitute(df_current)),
"' is your most recent data, and '",
deparse(substitute(df_previous)),
"' is your previous data. If comparing like for like, try waldo::compare()."
)
} else {
# Tell the user which rows are new, regardless of previous data changing
cli::cat_line(
"The following rows are new in '",
deparse(substitute(df_current)),
"': ",
col = "green"
)
cli::cat_print(
df_current_new_rows
)
}

# Return a simple message if there are no changes in previous data
if (length(waldo_object) == 0) {
cli::cat_bullet(
"And there are no differences with previous data.",
bullet = "tick",
col = "green",
bullet_col = "green"
)
} else {
# Return detailed breakdown and warning if previous data have changed.
if (length(waldo_object) > 0) {
cli::cat_line()
return(butterfly_object_list$butterfly_status)

cli::cat_bullet(
"But the following values have changes from the previous data:",
bullet = "info",
col = "orange",
bullet_col = "orange"
)
return(waldo_object)
}
}
}
Loading

0 comments on commit 7185708

Please sign in to comment.