Skip to content

Commit

Permalink
Merge pull request #65 from oxford-pharmacoepi/dev_nmb
Browse files Browse the repository at this point in the history
date range and intersect cohort clean
  • Loading branch information
edward-burn authored Apr 9, 2024
2 parents a3958d3 + bf7ee67 commit 8f05e80
Show file tree
Hide file tree
Showing 11 changed files with 428 additions and 353 deletions.
36 changes: 21 additions & 15 deletions R/generateIntersectCohorts.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#' combinations.
#' @param targetCohortId Ids to combine of the target cohort. If NULL all
#' cohort present in the table will be used.
#' @param gap Number of days between two subsequent cohort entries to be merged
#' in a single cohort record.
#' @param mutuallyExclusive Whether the generated cohorts are mutually
#' exclusive or not.
#' @param returnOnlyComb Whether to only get the combination cohort back
Expand Down Expand Up @@ -36,6 +38,7 @@ generateIntersectCohortSet <- function(cdm,
name,
targetCohortName,
targetCohortId = NULL,
gap = 0,
mutuallyExclusive = FALSE,
returnOnlyComb = FALSE) {
# initial checks
Expand Down Expand Up @@ -105,18 +108,18 @@ generateIntersectCohortSet <- function(cdm,
}

if (returnOnlyComb) {
toEliminate <- cohSet %>%
dplyr::rowwise() %>%
dplyr::mutate(
sum = sum(dplyr::c_across(-dplyr::all_of(c("cohort_definition_id", "cohort_name"))),
na.rm = TRUE)
) %>%
dplyr::filter(.data$sum == 1) %>%
dplyr::pull("cohort_definition_id")
cohSet <- cohSet |>
dplyr::filter(!.data$cohort_definition_id %in% .env$toEliminate) %>%
dplyr::group_by(.data$cohort_name) %>%
dplyr::mutate(cohort_definition_id = dplyr::cur_group_id())
toEliminate <- cohSet %>%
dplyr::rowwise() %>%
dplyr::mutate(
sum = sum(dplyr::c_across(-dplyr::all_of(c("cohort_definition_id", "cohort_name"))),
na.rm = TRUE)
) %>%
dplyr::filter(.data$sum == 1) %>%
dplyr::pull("cohort_definition_id")
cohSet <- cohSet |>
dplyr::filter(!.data$cohort_definition_id %in% .env$toEliminate) %>%
dplyr::group_by(.data$cohort_name) %>%
dplyr::mutate(cohort_definition_id = dplyr::cur_group_id())
}

# add cohort definition id
Expand All @@ -134,18 +137,21 @@ generateIntersectCohortSet <- function(cdm,


if (!mutuallyExclusive) {
cohort <- joinOverlap(x = cohort, gap = 1) %>%
dplyr::compute(name = name, temporary = FALSE)
cohSet <- cohSet %>%
dplyr::group_by(.data$cohort_definition_id, .data$cohort_name) %>%
dplyr::mutate(dplyr::across(
dplyr::everything(),
~ dplyr::if_else(dplyr::n_distinct(.x) == 1, 1, as.numeric(NA))
~ dplyr::if_else(dplyr::n_distinct(.x) == 1, 1, 0)
)) %>%
dplyr::ungroup() %>%
dplyr::distinct()
}

if (cohort |> dplyr::tally() |> dplyr::pull("n") > 0) {
cohort <- joinOverlap(x = cohort, gap = gap) %>%
dplyr::compute(name = name, temporary = FALSE)
}

# TODO
# create attrition

Expand Down
20 changes: 9 additions & 11 deletions R/requireCohortIntersectFlag.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@


#' Require cohort subjects are present in another cohort
#'
#' @param x Cohort table
#' @param targetCohortTable name of the cohort that we want to check for overlap
#' @param targetCohortId vector of cohort definition ids to include
#' @param x Cohort table.
#' @param targetCohortTable Name of the cohort that we want to check for overlap.
#' @param targetCohortId Vector of cohort definition ids to include.
#' @param indexDate Variable in x that contains the date to compute the
#' intersection.
#' @param targetStartDate date of reference in cohort table, either for start
#' (in overlap) or on its own (for incidence)
#' @param targetEndDate date of reference in cohort table, either for end
#' (overlap) or NULL (if incidence)
#' @param window window to consider events over
#' @param targetStartDate Date of reference in cohort table, either for start
#' (in overlap) or on its own (for incidence).
#' @param targetEndDate Date of reference in cohort table, either for end
#' (overlap) or NULL (if incidence).
#' @param window Window to consider events over.
#' @param negate If set as TRUE, criteria will be applied as exclusion
#' rather than inclusion (i.e. require absence in another cohort)
#' rather than inclusion (i.e. require absence in another cohort).
#'
#' @return Cohort table with only those in the other cohort kept
#' @export
Expand Down
226 changes: 118 additions & 108 deletions R/requireDateRange.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
#' Require that an index date is within a date range
#'
#' @param cohort A cohort table in a cdm reference
#' @param indexDate Variable in cohort that contains the index date of interest
#' @param cohort A cohort table in a cdm reference.
#' @param cohortId Vector of cohort definition ids to include. If NULL, all
#' cohort definition ids will be used.
#' @param dateRange A window of time during which the index date must have
#' been observed
#' been observed.
#' @param indexDate Variable in cohort that contains the index date of interest
#' @param name Name of the new cohort with the restriction.
#'
#' @return The cohort table with any cohort entries outside of the date range
#' dropped
Expand All @@ -17,32 +20,57 @@
#' requireInDateRange(indexDate = "cohort_start_date",
#' dateRange = as.Date(c("2010-01-01", "2019-01-01")))
requireInDateRange <- function(cohort,
indexDate = "cohort_start_date",
dateRange = as.Date(c(NA, NA))) {

checkCohort(cohort)
checkDateVariable(cohort = cohort, dateVar = indexDate)
checkDateRange(dateRange)

cohort <- cohort %>%
dplyr::filter(.data[[indexDate]] >= !!dateRange[1] &
.data[[indexDate]] <= !!dateRange[2]) %>%
CDMConnector::recordCohortAttrition(reason = paste0(
indexDate,
" between ", dateRange[1], " & ", dateRange[2]
))
cohortId = NULL,
dateRange = as.Date(c(NA, NA)),
indexDate = "cohort_start_date",
name = omopgenerics::tableName(cohort)) {

# checks
assertCharacter(name)
validateCohortTable(cohort)
cdm <- omopgenerics::cdmReference(cohort)
validateCDM(cdm)
validateIndexDate(indexDate, cohort)
ids <- omopgenerics::settings(cohort)$cohort_definition_id
cohortId <- validateCohortId(cohortId, ids)
validateDateRange(dateRange)

noRequirementsIds <- ids[!ids %in% cohortId]

if (all(ids %in% cohortId)) {
cohort <- cohort |>
dplyr::filter(.data[[indexDate]] >= !!dateRange[1] &
.data[[indexDate]] <= !!dateRange[2]) |>
dplyr::compute(name = name, temporary = FALSE) |>
CDMConnector::recordCohortAttrition(
reason = paste0(indexDate, " between ", dateRange[1], " & ", dateRange[2]
))
} else {
cohort <- cohort |>
dplyr::filter((.data[[indexDate]] >= !!dateRange[1] &
.data[[indexDate]] <= !!dateRange[2]) |
.data$cohort_definition_id %in% noRequirementsIds) |>
dplyr::compute(name = name, temporary = FALSE) |>
CDMConnector::recordCohortAttrition(
reason = paste0(indexDate, " between ", dateRange[1], " & ", dateRange[2]),
cohortId = cohortId
)
}

cohort

}

#' Trim cohort dates to be within a date range
#'
#' @param cohort A cohort table in a cdm reference
#' @param startDate Variable with earliest date
#' @param endDate Variable with latest date
#' @param cohort A cohort table in a cdm reference.
#' @param cohortId Vector of cohort definition ids to include. If NULL, all
#' cohort definition ids will be used.
#' @param dateRange A window of time during which the index date must have
#' been observed
#' been observed.
#' @param startDate Variable with earliest date.
#' @param endDate Variable with latest date.
#' @param name Name of the new cohort with the restriction.
#'
#'
#' @return The cohort table with record timings updated to only be within the
#' date range. Any records with all time outside of the range will have
Expand All @@ -59,122 +87,104 @@ requireInDateRange <- function(cohort,
#' dateRange = as.Date(c("2015-01-01",
#' "2015-12-31")))
trimToDateRange <- function(cohort,
cohortId = NULL,
dateRange = as.Date(c(NA, NA)),
startDate = "cohort_start_date",
endDate = "cohort_end_date",
dateRange = as.Date(c(NA, NA))) {

checkCohort(cohort)
checkDateVariable(cohort = cohort, dateVar = startDate)
checkDateVariable(cohort = cohort, dateVar = endDate)
checkDateRange(dateRange)

#
# # validate inputs
# if (!isTRUE(inherits(cdm, "cdm_reference"))) {
# cli::cli_abort("cohort must be part of a cdm reference")
# }
# if (!"GeneratedCohortSet" %in% class(cohort) ||
# !all(c(
# "cohort_definition_id", "subject_id",
# "cohort_start_date", "cohort_end_date"
# ) %in%
# colnames(cohort))) {
# cli::cli_abort("cohort must be a GeneratedCohortSet")
# }
#
# if (!indexDate %in% colnames(cohort)) {
# cli::cli_abort(paste0(indexDate, " must be a date column in the cohort table"))
# }
#
# if (!endDateName %in% colnames(cohort)) {
# cli::cli_abort(paste0(endDateName, " must be a date column in the cohort table"))
# }

cohort <- trimStartDate(cohort = cohort,
startDate = startDate,
endDate = endDate,
minDate = dateRange[1]
) %>%
CDMConnector::recordCohortAttrition(reason = paste0(
startDate,
" >= ", dateRange[1]))

cohort <- trimEndDate(
cohort = cohort,
startDate = startDate,
endDate = endDate,
maxDate = dateRange[2]
) %>%
CDMConnector::recordCohortAttrition(reason = paste0(
endDate,
" <= ", dateRange[2]
))
name = omopgenerics::tableName(cohort)) {

# checks
assertCharacter(name)
validateCohortTable(cohort)
cdm <- omopgenerics::cdmReference(cohort)
validateCDM(cdm)
validateIndexDate(startDate, cohort)
validateIndexDate(endDate, cohort)
ids <- omopgenerics::settings(cohort)$cohort_definition_id
cohortId <- validateCohortId(cohortId, ids)
validateDateRange(dateRange)

noRequirementsIds <- ids[!ids %in% cohortId]

if (all(ids %in% cohortId)) {
cohort <- trimStartDate(cohort = cohort,
startDate = startDate,
endDate = endDate,
minDate = dateRange[1]
) %>%
dplyr::compute(name = name, temporary = FALSE) %>%
CDMConnector::recordCohortAttrition(
reason = paste0(startDate, " >= ", dateRange[1])
) %>%
trimEndDate(
startDate = startDate,
endDate = endDate,
maxDate = dateRange[2]
) %>%
dplyr::compute(name = name, temporary = FALSE) %>%
CDMConnector::recordCohortAttrition(
reason = paste0(endDate, " <= ", dateRange[2])
)
} else {
cohort <- cohort %>%
trimStartDate(noRequirementsIds = noRequirementsIds,
startDate = startDate,
endDate = endDate,
minDate = dateRange[1]
) %>%
dplyr::compute(name = name, temporary = FALSE) %>%
CDMConnector::recordCohortAttrition(
reason = paste0(startDate, " >= ", dateRange[1]),
cohortId = cohortId
) %>%
trimEndDate(
noRequirementsIds = noRequirementsIds,
startDate = startDate,
endDate = endDate,
maxDate = dateRange[2]
) %>%
dplyr::compute(name = name, temporary = FALSE) %>%
CDMConnector::recordCohortAttrition(
reason = paste0(endDate, " <= ", dateRange[2]),
cohortId = cohortId
)
}

cohort
}

trimStartDate <- function(cohort,
startDate,
endDate,
minDate) {
minDate,
noRequirementsIds = NULL) {

if (!is.na(startDate)) {
cohort <- cohort %>%
dplyr::mutate(!!startDate := dplyr::if_else(
.data[[startDate]] <= !!minDate,
as.Date(minDate), .data[[startDate]]
)) %>%
dplyr::filter(.data[[startDate]] <= .data[[endDate]])
dplyr::filter(.data[[startDate]] <= .data[[endDate]] |
.data$cohort_definition_id %in% noRequirementsIds)
}
return(cohort)
}
trimEndDate <- function(
cohort,
startDate,
endDate,
maxDate) {
maxDate,
noRequirementsIds = NULL) {

if (!is.na(endDate)) {
cohort <- cohort %>%
dplyr::mutate(!!endDate := dplyr::if_else(
.data[[endDate]] >= !!maxDate,
as.Date(maxDate), .data[[endDate]]
)) %>%
dplyr::filter(.data[[startDate]] <= .data[[endDate]])
dplyr::filter(.data[[startDate]] <= .data[[endDate]] |
.data$cohort_definition_id %in% noRequirementsIds)
}
return(cohort)
}


checkCohort <- function(cohort){
if (!"GeneratedCohortSet" %in% class(cohort) ||
!all(c(
"cohort_definition_id", "subject_id",
"cohort_start_date", "cohort_end_date"
) %in%
colnames(cohort))) {
cli::cli_abort("cohort must be a GeneratedCohortSet")
}
}

checkDateVariable <- function(cohort, dateVar){
if (!dateVar %in% colnames(cohort)) {
cli::cli_abort(paste0(dateVar, " must be a date column in the cohort table"))
}
}

checkDateRange<-function(dateRange){
if(!"Date" %in% class(dateRange)){
cli::cli_abort("dateRange is not a date")
}
if(length(dateRange) != 2){
cli::cli_abort("dateRange must be length two")
}
if(dateRange[1]>dateRange[2]){
cli::cli_abort("First date in dateRange cannot be after second")
}
return(invisible(dateRange))
}



Loading

0 comments on commit 8f05e80

Please sign in to comment.