From eb76cf940453c1b88428da512aaaa2c712cbc9e6 Mon Sep 17 00:00:00 2001 From: Marta Alcalde-Herraiz Date: Mon, 8 Jan 2024 15:37:21 +0000 Subject: [PATCH 1/5] generateMatchedCohortSet vignette --- R/generateMatchedCohortSet.R | 6 +-- vignettes/a03_age_sex_matching.Rmd | 62 ++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/R/generateMatchedCohortSet.R b/R/generateMatchedCohortSet.R index 4a6ecde9..bcf99efa 100644 --- a/R/generateMatchedCohortSet.R +++ b/R/generateMatchedCohortSet.R @@ -63,7 +63,7 @@ generateMatchedCohortSet <- function(cdm, cdm <- checkRatio(cdm, name, ratio, targetCohortId, n) # Check cohort set ref - cdm <- checkCohortSetRef(cdm, name, matchSex, matchYearOfBirth, targetCohortId, n) + cdm <- checkCohortSetRef(cdm, name, targetCohortName, matchSex, matchYearOfBirth, targetCohortId, n) } # Return return(cdm) @@ -416,10 +416,10 @@ checkRatio <- function(cdm, name, ratio, targetCohortId, n){ } -checkCohortSetRef <- function(cdm, name, matchSex, matchYearOfBirth, targetCohortId, n){ +checkCohortSetRef <- function(cdm, name, targetCohortName, matchSex, matchYearOfBirth, targetCohortId, n){ cohort_set_ref <- cdm[[name]] %>% CDMConnector::cohort_set() %>% - dplyr::mutate(target_cohort_name = .env$name) %>% + dplyr::mutate(target_cohort_name = .env$targetCohortName) %>% dplyr::mutate(match_sex = .env$matchSex) %>% dplyr::mutate(match_year_of_birth = .env$matchYearOfBirth) %>% dplyr::mutate(match_status = dplyr::if_else(.data$cohort_definition_id %in% .env$targetCohortId, "target", "matched")) %>% diff --git a/vignettes/a03_age_sex_matching.Rmd b/vignettes/a03_age_sex_matching.Rmd index 6fc177f8..a0bae04a 100644 --- a/vignettes/a03_age_sex_matching.Rmd +++ b/vignettes/a03_age_sex_matching.Rmd @@ -14,6 +14,68 @@ knitr::opts_chunk$set( ) ``` +# Introduction +CohortConstructor packages includes a function to obtain an age and sex matched cohort, the `generateMatchedCohortSet()` function. In this vignette, we will explore the functionalities of the function and provide valuable examples for its usage. + +## Create mock data +We will first use `mockDrugUtilisation()` functin from DrugUtilisation package to create mock data. + ```{r setup} library(CohortConstructor) +library(dplyr) +library(DrugUtilisation) + +cdm <- mockDrugUtilisation(numberIndividual = 200) +``` + +As we will use `cohort1` to explore `generateMatchedCohortSet()`, let us first use `cohort_attrition()` from CDMConnector package to explore the this cohort: + +```{r setup} +CDMConnector::cohort_set(cdm$cohort1) ``` +Notice that there are three cohorts within this tibble, with id's going from 1 to 3. + +# Use `generateMatchedCohortSet()` to create an age-sex matched cohort +Let us first see an example of how this function works. For its usage, we need to provide a `cdm` object, the `targetCohortName`, which is the name of the table containing the cohort of interest, and the `name` of the new table that will be created containing the cohort and the matching cohort. We will also use the argument `targetCohortId` to specify that we only want a matching cohort for `cohort_definition_id = 1`. + +```{r setup} +cdm <- generateMatchedCohortSet(cdm = cdm, + name = "matched_cohort1", + targetCohortName = "cohort1", + targetCohortId = 1) + +CDMConnector::cohort_set(cdm$matched_cohort1) +``` +Notice that in the generated tibble, there are two cohorts: `cohort_definition_id = 1` (original cohort), and `cohort_definition_id = 4` (matched cohort). `target_cohort_name` column indicates which is the original cohort. `match_sex` and `match_year_of_birth` adopt boolean values (`TRUE`/`FALSE`) indicating if we have matched for sex and age, or not. `match_status` indicate if it is the original cohort (`target`) or if it is the matched cohort (`matched`). `target_cohort_id` indicates which is the cohort_id of the original cohort. + +`matchSex` is a boolean parameter (`TRUE`/`FALSE`) indicating if we want to match by sex (`TRUE`) or we do not want (`FALSE`). + +`matchYear` is another boolean parameter (`TRUE`/`FALSE`) indicating if we want to match by age (`TRUE`) or we do not want (`FALSE`). + +The default matching ratio is 1:1. Use `cohort_counts()` from CDMConnector to check if the matching has been done as desired. + +```{r setup} +CDMConnector::cohort_count(cdm$matched_cohort1) +``` +To have more information about the exclusion criteria applied to perform the matching, use `cohort_attrition()` from CDMConnector package: +```{r setup} +# Original cohort +CDMConnector::cohort_attrition(cdm$matched_cohort1) %>% filter(cohort_definition_id == 1) + +# Matched cohort +CDMConnector::cohort_attrition(cdm$matched_cohort1) %>% filter(cohort_definition_id == 4) +``` +Briefly, from the original cohort, we exclude first those individuals that do not have a match, and then individuals that their matching pair is not in observation during the assigned `cohort_start_date`. From the matched cohort, we start from the whole database and we first exclude individuals that are in the original cohort. Afterwards, we exclude individuals that do not have a match, then individuals that are not in observation during the assigned `cohort_start_date`, and finally we remove the as many individuals as required to fulfill the ratio. + +You can modify the `ratio` parameter to tailor your matched cohort. + +```{r setup} +cdm <- generateMatchedCohortSet(cdm = cdm, + name = "matched_cohort1", + targetCohortName = "cohort1", + targetCohortId = 1, + ratio = Inf) + +CDMConnector::cohort_count(cdm$matched_cohort1) +``` + From 67c0b0f0b395138a79ad00d8af48cceb4eab34c4 Mon Sep 17 00:00:00 2001 From: Marta Alcalde-Herraiz Date: Mon, 8 Jan 2024 15:50:23 +0000 Subject: [PATCH 2/5] Update 03_age_sex_matching --- vignettes/a03_age_sex_matching.Rmd | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vignettes/a03_age_sex_matching.Rmd b/vignettes/a03_age_sex_matching.Rmd index a0bae04a..c1bf5c6a 100644 --- a/vignettes/a03_age_sex_matching.Rmd +++ b/vignettes/a03_age_sex_matching.Rmd @@ -20,7 +20,7 @@ CohortConstructor packages includes a function to obtain an age and sex matched ## Create mock data We will first use `mockDrugUtilisation()` functin from DrugUtilisation package to create mock data. -```{r setup} +```{r, include = FALSE} library(CohortConstructor) library(dplyr) library(DrugUtilisation) @@ -30,7 +30,7 @@ cdm <- mockDrugUtilisation(numberIndividual = 200) As we will use `cohort1` to explore `generateMatchedCohortSet()`, let us first use `cohort_attrition()` from CDMConnector package to explore the this cohort: -```{r setup} +```{r, include = FALSE} CDMConnector::cohort_set(cdm$cohort1) ``` Notice that there are three cohorts within this tibble, with id's going from 1 to 3. @@ -38,7 +38,7 @@ Notice that there are three cohorts within this tibble, with id's going from 1 t # Use `generateMatchedCohortSet()` to create an age-sex matched cohort Let us first see an example of how this function works. For its usage, we need to provide a `cdm` object, the `targetCohortName`, which is the name of the table containing the cohort of interest, and the `name` of the new table that will be created containing the cohort and the matching cohort. We will also use the argument `targetCohortId` to specify that we only want a matching cohort for `cohort_definition_id = 1`. -```{r setup} +```{r, include = FALSE} cdm <- generateMatchedCohortSet(cdm = cdm, name = "matched_cohort1", targetCohortName = "cohort1", @@ -54,11 +54,11 @@ Notice that in the generated tibble, there are two cohorts: `cohort_definition_i The default matching ratio is 1:1. Use `cohort_counts()` from CDMConnector to check if the matching has been done as desired. -```{r setup} +```{r, include = FALSE} CDMConnector::cohort_count(cdm$matched_cohort1) ``` To have more information about the exclusion criteria applied to perform the matching, use `cohort_attrition()` from CDMConnector package: -```{r setup} +```{r, include = FALSE} # Original cohort CDMConnector::cohort_attrition(cdm$matched_cohort1) %>% filter(cohort_definition_id == 1) @@ -69,7 +69,7 @@ Briefly, from the original cohort, we exclude first those individuals that do no You can modify the `ratio` parameter to tailor your matched cohort. -```{r setup} +```{r, include = FALSE} cdm <- generateMatchedCohortSet(cdm = cdm, name = "matched_cohort1", targetCohortName = "cohort1", From b7f6de277df3aa61f4b432c4b6cb602da66ae199 Mon Sep 17 00:00:00 2001 From: Marta Alcalde-Herraiz Date: Fri, 12 Jan 2024 16:12:47 +0000 Subject: [PATCH 3/5] Update - Add an error message for ratio < 1 - Update matching vignette --- .Rhistory | 553 ++++++++++++++++++++++++++--- R/generateMatchedCohortSet.R | 7 + vignettes/a03_age_sex_matching.Rmd | 80 +++-- 3 files changed, 571 insertions(+), 69 deletions(-) diff --git a/.Rhistory b/.Rhistory index 08afce68..1d3b7143 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,41 +1,512 @@ -x <- c(1, 2, 3, 4, 5) -expand.grid(c) -expand.grid(x) -expand.grid(rep(list(x), length(x))) -5*5*5*5*5 -lapply(names, c(0, 1)) -lapply(names, function(x){c(0, 1)}) -names -names <- c("cohort1", "cohort2", "cohort3", "cohort3") -names <- c("cohort1", "cohort2", "cohort3", "cohort4") -lapply(names, function(x){c(0, 1)}) -lapply(names, function(x){c(0, 1)}) |> -expand.grid() -lapply(names, function(x){c(0, 1)}) |> -expand.grid() |> -rlang::set_names(name) -lapply(names, function(x){c(0, 1)}) |> -expand.grid() |> -rlang::set_names(names) -lapply(names, function(x){c(0, 1)}) |> -expand.grid() |> -rlang::set_names(names) |> -dplyr::as_tibble() -lapply(names, function(x){c(0, 1)}) |> -expand.grid() |> -rlang::set_names(names) |> -dplyr::as_tibble() %>% -dplyr::mutate("cohort_definition_id" = dplyr::row_number()) -lapply(names, function(x){c(0, 1)}) |> -expand.grid() |> -rlang::set_names(names) |> -dplyr::as_tibble() |> -dplyr::mutate("cohort_definition_id" = dplyr::row_number()) -getCohortSetMutuallyEclusive <- function(names) { -lapply(names, function(x){c(0, 1)}) |> -expand.grid() |> -rlang::set_names(names) |> -dplyr::as_tibble() |> -dplyr::mutate("cohort_definition_id" = dplyr::row_number()) -} -getCohortSetMutuallyEclusive(names) +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = NULL, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a +a[["new_cohort"]] +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = 1, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a +a[["new_cohort"]] +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = 1, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a[["new_cohort"]] +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = 2, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a[["new_cohort"]] +name = "new_cohort" +targetCohortName = "cases" +targetCohortId = 2 +matchSex = TRUE +matchYearOfBirth = TRUE +ratio = 4 +# validate initial input +validateInput( +cdm = cdm, name = name, targetCohortName = targetCohortName, +targetCohortId = targetCohortId, matchSex = matchSex, +matchYearOfBirth = matchYearOfBirth, ratio = ratio +) +# get the number of cohorts +n <- getNumberOfCohorts(cdm, targetCohortName) +n +# get target cohort id +targetCohortId <- getTargetCohortId(cdm, targetCohortId, targetCohortName) +targetCohortId +# Create the cohort name with cases and controls of the targetCohortId +cdm <- getNewCohort(cdm, name, targetCohortName, targetCohortId, n) +cdm +cdm[["new_cohort"]] +cdm[["new_cohort"]] %>% print(n = 10) +cdm[["new_cohort"]] %>% print(n = 15) +cdm[["cases"]] +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8507,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2012-01-04","2014-12-15","2004-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2012-01-05","2014-12-16","2004-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10), +) +) +cdm <- DrugUtilisation::generateConceptCohortSet( +cdm = cdmMock, +conceptSet = list(c1 = 317009, c2 = 4266367), +name = "cases", +end = "observation_period_end_date", +requiredObservation = c(0,0), +overwrite = TRUE +) +cdm[["cases"]] +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2012-01-04","2014-12-15","2004-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2012-01-05","2014-12-16","2004-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10), +) +) +cdmMock +cdmMock[["condition_occurrence"]] +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2012-01-04","2014-12-15","2004-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2012-01-05","2014-12-16","2004-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10), +) +) +cdm <- DrugUtilisation::generateConceptCohortSet( +cdm = cdmMock, +conceptSet = list(c1 = 317009, c2 = 4266367), +name = "cases", +end = "observation_period_end_date", +requiredObservation = c(0,0), +overwrite = TRUE +) +cdm[["cases"]] +cdm[["observation_period"]] +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2003-01-04","2014-12-15","2010-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2003-01-05","2014-12-16","2010-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10), +) +) +cdm <- DrugUtilisation::generateConceptCohortSet( +cdm = cdmMock, +conceptSet = list(c1 = 317009, c2 = 4266367), +name = "cases", +end = "observation_period_end_date", +requiredObservation = c(0,0), +overwrite = TRUE +) +cdm[["cases"]] +generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = 2, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = 2, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +cdm[["cases"]] +a[["new_cohort"]] +cdm[["observation_period"]] +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2003-01-04","2014-12-15","2010-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2003-01-05","2014-12-16","2010-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10)), +observation_period = tibble::tibble("observation_period_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"observation_period_start_date" = rep(as.Date(1950-01-01),10), +"observation_period_end_date" = rep(as.Date(2023-12-05),10), +"period_type_concept_id" = 44814724) +) +cdm <- DrugUtilisation::generateConceptCohortSet( +cdm = cdmMock, +conceptSet = list(c1 = 317009, c2 = 4266367), +name = "cases", +end = "observation_period_end_date", +requiredObservation = c(0,0), +overwrite = TRUE +) +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = 2, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2003-01-04","2014-12-15","2010-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2003-01-05","2014-12-16","2010-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10)), +observation_period = tibble::tibble("observation_period_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"observation_period_start_date" = rep(as.Date(1950-01-01),10), +"observation_period_end_date" = rep(as.Date(2023-12-05),10), +"period_type_concept_id" = 44814724) +) +cdm <- DrugUtilisation::generateConceptCohortSet( +cdm = cdmMock, +conceptSet = list(c1 = 317009, c2 = 4266367), +name = "cases", +end = "observation_period_end_date", +requiredObservation = c(0,0), +overwrite = TRUE +) +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = 2, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +cdm[["cases"]] +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2003-01-04","2014-12-15","2010-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2003-01-05","2014-12-16","2010-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10)), +observation_period = tibble::tibble("observation_period_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"observation_period_start_date" = rep(as.Date(1950-01-01),10), +"observation_period_end_date" = rep(as.Date(2021-12-05),10), +"period_type_concept_id" = 44814724) +) +cdm <- DrugUtilisation::generateConceptCohortSet( +cdm = cdmMock, +conceptSet = list(c1 = 317009, c2 = 4266367), +name = "cases", +end = "observation_period_end_date", +requiredObservation = c(0,0), +overwrite = TRUE +) +cdm[["cases"]] +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2003-01-04","2014-12-15","2010-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2003-01-05","2014-12-16","2010-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10)), +observation_period = tibble::tibble("observation_period_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"observation_period_start_date" = rep(as.Date(1984-01-01),10), +"observation_period_end_date" = rep(as.Date(2021-12-05),10), +"period_type_concept_id" = 44814724) +) +rep(as.Date(1984-01-01),10) +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2003-01-04","2014-12-15","2010-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2003-01-05","2014-12-16","2010-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10)), +observation_period = tibble::tibble("observation_period_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"observation_period_start_date" = as.Date(rep("1984-01-01",10)), +"observation_period_end_date" = as.Date(rep("2021-01-01",10)), +"period_type_concept_id" = 44814724) +) +cdm <- DrugUtilisation::generateConceptCohortSet( +cdm = cdmMock, +conceptSet = list(c1 = 317009, c2 = 4266367), +name = "cases", +end = "observation_period_end_date", +requiredObservation = c(0,0), +overwrite = TRUE +) +cdm[["cases"]] +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = 2, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a[["new_cohort"]] +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = NULL, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a[["cases"]] +a[["new_cohort"]] +a[["new_cohort"]] %>% print(n = 20) +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(1,3)) %>% +dplyr::summarise(subject_unique) +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(1,3)) %>% +dplyr::summarise(subject_id) +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(1,3)) %>% +dplyr::summarise(subject_id) %>% unique() +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(1,3)) %>% +dplyr::summarise(subject_id) %>% +distinct() +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(1,3)) %>% +dplyr::summarise(subject_id) %>% +dplyr::distinct() +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(1,3)) %>% +dplyr::summarise(subject_id) %>% +dplyr::distinct() %>% dplyr::pull() +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(1,3)) %>% +dplyr::summarise(subject_id) %>% +dplyr::distinct() %>% dplyr::pull() %>% length() == 10 +expect_true(a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(2,4)) %>% +dplyr::summarise(subject_id) %>% +dplyr::distinct() %>% dplyr::pull() %>% length() == 10) +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(2,4)) %>% +dplyr::summarise(subject_id) %>% +dplyr::distinct() %>% dplyr::pull() %>% length() +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(2,4)) +a[["new_cohort"]] %>% +dplyr::filter(cohort_definition_id %in% c(2,4)) %>% +dplyr::summarise(cohort_start_date) %>% +dplyr::distinct() %>% dplyr::pull() %>% length() == 2 +a[["new_cohort"]] %>% dplyr::filter(subject_id == 5) +a[["new_cohort"]] %>% dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) %>% dplyr::arrange() %>% dplyr::pull() +a[["new_cohort"]] %>% dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) %>% dplyr::arrange(desc()) %>% dplyr::pull() +a[["new_cohort"]] %>% dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) +a[["new_cohort"]] %>% dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) %>% dplyr::pull() +a[["new_cohort"]] %>% dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) %>% dplyr::arrange() +as.Date(c("2017-01-01","2019-01-01","2015-01-01")) +arrange(as.Date(c("2017-01-01","2019-01-01","2015-01-01"))) +dplyr::arrange(as.Date(c("2017-01-01","2019-01-01","2015-01-01"))) +order(as.Date(c("2017-01-01","2019-01-01","2015-01-01"))) +as.Date(c("2017-01-01","2019-01-01","2015-01-01")) +as.Date(c("2017-01-01","2019-01-01","2015-01-01")) %>% order() +as.Date(c("2017-01-01","2019-01-01","2015-01-01")) %>% mdy +as.Date(c("2017-01-01","2019-01-01","2015-01-01")) %>% lubridate::mdy() +a[["new_cohort"]] %>% dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) %>% dplyr::pull() +a[["new_cohort"]] %>% dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) %>% dplyr::pull() %in% c("2017-10-30","2003-01-04","2014-12-15","2010-09-09") +expect_true(a[["new_cohort"]] %>% +dplyr::filter(subject_id %in% c(seq(5,10,1)) %>% dplyr::summarise(cohort_start_date) %>% dplyr::pull() %in% c("2017-10-30","2003-01-04","2014-12-15","2010-09-09")) +}) +expect_true(a[["new_cohort"]] %>% +dplyr::filter(subject_id %in% c(seq(5,10,1))) %>% dplyr::summarise(cohort_start_date) %>% dplyr::pull() %in% c("2017-10-30","2003-01-04","2014-12-15","2010-09-09")) +expect_true(a[["new_cohort"]] %>% +dplyr::filter(subject_id %in% seq(5,10,1)) %>% dplyr::summarise(cohort_start_date) %>% dplyr::pull() %in% c("2017-10-30","2003-01-04","2014-12-15","2010-09-09")) +expect_true(a[["new_cohort"]] %>% +dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) %>% dplyr::pull() %in% c("2017-10-30","2003-01-04","2014-12-15","2010-09-09")) +outc %>% dplyr::distinct() +outc <- a[["new_cohort"]] %>% +dplyr::filter(subject_id == 5) %>% dplyr::summarise(cohort_start_date) %>% dplyr::pull() %in% c("2017-10-30","2003-01-04","2014-12-15","2010-09-09") +outc %>% dplyr::distinct() +outc == c(TRUE,TRUE) +dplyr::distinct(outc) +unique(outc) +expect_true(unique(outc) == TRUE) +devtools::load_all(".") +devtools::check() +usethis::use_package(tibble,"Suggests") +usethis::use_package(tibble,"Suggests") +usethis::use_package("tibble","Suggests") +devtools::check() +devtools::load_all(".") +devtools::check() +devtools::load_all(".") +devtools::check() +install.packages("DrugUtilisation") +install.packages("DrugUtilisation") +devtools::load_all(".") +devtools::check() +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date(1980,04,01),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2003-01-04","2014-12-15","2010-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2003-01-05","2014-12-16","2010-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10)), +observation_period = tibble::tibble("observation_period_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"observation_period_start_date" = as.Date(rep("1984-01-01",10)), +"observation_period_end_date" = as.Date(rep("2021-01-01",10)), +"period_type_concept_id" = 44814724) +) +help(as.Date) +help(rep) +help(seq) +cdm <- DrugUtilisation::generateConceptCohortSet( +cdm = cdmMock, +conceptSet = list(c1 = 317009, c2 = 4266367), +name = "cases", +end = "observation_period_end_date", +requiredObservation = c(0,0), +overwrite = TRUE +) +generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = NULL, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +a <- generateMatchedCohortSet(cdm, +name = "new_cohort", +targetCohortName = "cases", +targetCohortId = NULL, +matchSex = TRUE, +matchYearOfBirth = TRUE, +ratio = 4) +rep(as.Date(1980,04,01),10) +# Generate mock data +cdmMock <- DrugUtilisation::mockDrugUtilisation( +numberIndividuals = 10, +person = tibble::tibble("person_id" = seq(1,10,1), +"gender_concept_id" = rep(8532,10), +"year_of_birth" = rep(1980, 10), +"day_of_birth" = rep(1, 10), +"birth_date_time" = rep(as.Date("1980,04,01"),10), +"month_of_birth" = rep(4, 10)), +condition_occurrence = tibble::tibble("condition_ocurrence_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"condition_concept_id" = c(317009,317009,4266367,4266367,rep(1,6)), +"condition_start_date" = as.Date(c("2017-10-30","2003-01-04","2014-12-15","2010-09-09","2004-08-26","1985-03-31","1985-03-13","1985-07-11","1983-11-07","2020-01-13")), +"condition_end_date" = as.Date(c("2017-11-01","2003-01-05","2014-12-16","2010-09-10","2004-08-27","1985-04-01","1985-03-14","1985-07-12","1983-11-08","2020-01-14")), +"condition_type_concept_id" = rep(32020,10)), +observation_period = tibble::tibble("observation_period_id" = seq(1,10,1), +"person_id" = seq(1,10,1), +"observation_period_start_date" = as.Date(rep("1984-01-01",10)), +"observation_period_end_date" = as.Date(rep("2021-01-01",10)), +"period_type_concept_id" = 44814724) +) +as.Date(rep("1980,04,01",10)) +as.Date(rep("1980,04,01",10)) +as.Date(rep("1980-04-01",10)) +devtools::load_all(".") +devtools::check() +library(CohortConstructor) +library(CohortConstructor) diff --git a/R/generateMatchedCohortSet.R b/R/generateMatchedCohortSet.R index bcf99efa..d0d12f5e 100644 --- a/R/generateMatchedCohortSet.R +++ b/R/generateMatchedCohortSet.R @@ -125,6 +125,13 @@ validateInput <- function(cdm, errorMessage$push(glue::glue("- {name} table does not containg '{targetCohortId}' as a cohort_definition_id")) } } + # Check if ratio is > 0 + ratio_check <- ratio > 0 + checkmate::assertTRUE(ratio_check, add = errorMessage) + if(!isTRUE(ratio_check)){ + errorMessage$push(glue::glue("- ratio parameter must be > 0 ")) + } + checkmate::reportAssertions(collection = errorMessage) return(invisible(TRUE)) } diff --git a/vignettes/a03_age_sex_matching.Rmd b/vignettes/a03_age_sex_matching.Rmd index c1bf5c6a..26efe14b 100644 --- a/vignettes/a03_age_sex_matching.Rmd +++ b/vignettes/a03_age_sex_matching.Rmd @@ -1,8 +1,8 @@ --- -title: "a03_age_sex_matching" +title: "Generate a matched age and sex cohort" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{a03_age_sex_matching} + %\VignetteIndexEntry{Generate a matched age and sex cohort} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -15,30 +15,30 @@ knitr::opts_chunk$set( ``` # Introduction -CohortConstructor packages includes a function to obtain an age and sex matched cohort, the `generateMatchedCohortSet()` function. In this vignette, we will explore the functionalities of the function and provide valuable examples for its usage. +CohortConstructor packages includes a function to obtain an age and sex matched cohort, the `generateMatchedCohortSet()` function. In this vignette, we will explore the usage of this function. ## Create mock data -We will first use `mockDrugUtilisation()` functin from DrugUtilisation package to create mock data. +We will first use `mockDrugUtilisation()` function from DrugUtilisation package to create mock data. -```{r, include = FALSE} +```{r setup, message = FALSE, warning = FALSE} library(CohortConstructor) library(dplyr) library(DrugUtilisation) -cdm <- mockDrugUtilisation(numberIndividual = 200) +cdm <- mockDrugUtilisation(numberIndividual = 1000) ``` -As we will use `cohort1` to explore `generateMatchedCohortSet()`, let us first use `cohort_attrition()` from CDMConnector package to explore the this cohort: +As we will use `cohort1` to explore `generateMatchedCohortSet()`, let us first use `cohort_attrition()` from CDMConnector package to explore this cohort: -```{r, include = FALSE} +```{r} CDMConnector::cohort_set(cdm$cohort1) ``` Notice that there are three cohorts within this tibble, with id's going from 1 to 3. -# Use `generateMatchedCohortSet()` to create an age-sex matched cohort -Let us first see an example of how this function works. For its usage, we need to provide a `cdm` object, the `targetCohortName`, which is the name of the table containing the cohort of interest, and the `name` of the new table that will be created containing the cohort and the matching cohort. We will also use the argument `targetCohortId` to specify that we only want a matching cohort for `cohort_definition_id = 1`. +# Use generateMatchedCohortSet() to create an age-sex matched cohort +Let us first see an example of how this function works. For its usage, we need to provide a `cdm` object, the `targetCohortName`, which is the name of the table containing the cohort of interest, and the `name` of the new generated tibble containing the cohort and the matched cohort. We will also use the argument `targetCohortId` to specify that we only want a matched cohort for `cohort_definition_id = 1`. -```{r, include = FALSE} +```{r} cdm <- generateMatchedCohortSet(cdm = cdm, name = "matched_cohort1", targetCohortName = "cohort1", @@ -46,36 +46,60 @@ cdm <- generateMatchedCohortSet(cdm = cdm, CDMConnector::cohort_set(cdm$matched_cohort1) ``` -Notice that in the generated tibble, there are two cohorts: `cohort_definition_id = 1` (original cohort), and `cohort_definition_id = 4` (matched cohort). `target_cohort_name` column indicates which is the original cohort. `match_sex` and `match_year_of_birth` adopt boolean values (`TRUE`/`FALSE`) indicating if we have matched for sex and age, or not. `match_status` indicate if it is the original cohort (`target`) or if it is the matched cohort (`matched`). `target_cohort_id` indicates which is the cohort_id of the original cohort. - -`matchSex` is a boolean parameter (`TRUE`/`FALSE`) indicating if we want to match by sex (`TRUE`) or we do not want (`FALSE`). +Notice that in the generated tibble, there are two cohorts: `cohort_definition_id = 1` (original cohort), and `cohort_definition_id = 4` (matched cohort). *target_cohort_name* column indicates which is the original cohort. *match_sex* and *match_year_of_birth* adopt boolean values (`TRUE`/`FALSE`) indicating if we have matched for sex and age, or not. *match_status* indicate if it is the original cohort (`target`) or if it is the matched cohort (`matched`). *target_cohort_id* indicates which is the cohort_id of the original cohort. -`matchYear` is another boolean parameter (`TRUE`/`FALSE`) indicating if we want to match by age (`TRUE`) or we do not want (`FALSE`). - -The default matching ratio is 1:1. Use `cohort_counts()` from CDMConnector to check if the matching has been done as desired. - -```{r, include = FALSE} -CDMConnector::cohort_count(cdm$matched_cohort1) -``` -To have more information about the exclusion criteria applied to perform the matching, use `cohort_attrition()` from CDMConnector package: -```{r, include = FALSE} +Check the exclusion criteria applied to generate the new cohorts by using `cohort_attrition()` from CDMConnector package: +```{r} # Original cohort CDMConnector::cohort_attrition(cdm$matched_cohort1) %>% filter(cohort_definition_id == 1) # Matched cohort CDMConnector::cohort_attrition(cdm$matched_cohort1) %>% filter(cohort_definition_id == 4) ``` -Briefly, from the original cohort, we exclude first those individuals that do not have a match, and then individuals that their matching pair is not in observation during the assigned `cohort_start_date`. From the matched cohort, we start from the whole database and we first exclude individuals that are in the original cohort. Afterwards, we exclude individuals that do not have a match, then individuals that are not in observation during the assigned `cohort_start_date`, and finally we remove the as many individuals as required to fulfill the ratio. +Briefly, from the original cohort, we exclude first those individuals that do not have a match, and then individuals that their matching pair is not in observation during the assigned *cohort_start_date*. From the matched cohort, we start from the whole database and we first exclude individuals that are in the original cohort. Afterwards, we exclude individuals that do not have a match, then individuals that are not in observation during the assigned *cohort_start_date*, and finally we remove as many individuals as required to fulfill the ratio. -You can modify the `ratio` parameter to tailor your matched cohort. +Notice that matching pairs are randomly assigned, so it is probable that every time you execute this function, the generated cohorts change. Use `set.seed()` to avoid this. -```{r, include = FALSE} +## matchSex parameter +`matchSex` is a boolean parameter (`TRUE`/`FALSE`) indicating if we want to match by sex (`TRUE`) or we do not want to (`FALSE`). + +## matchYear parameter +`matchYear` is another boolean parameter (`TRUE`/`FALSE`) indicating if we want to match by age (`TRUE`) or we do not want (`FALSE`). + +Notice that if `matchSex = FALSE` and `matchYear = FALSE`, we will obtain an unmatched comparator cohort. + +## ratio parameter +The default matching ratio is 1:1 (`ratio = 1`). Use `cohort_counts()` from CDMConnector to check if the matching has been done as desired. + +```{r} +CDMConnector::cohort_count(cdm$matched_cohort1) +``` + +You can modify the `ratio` parameter to tailor your matched cohort. `ratio` can adopt values from 1 to Inf. + +```{r} cdm <- generateMatchedCohortSet(cdm = cdm, - name = "matched_cohort1", + name = "matched_cohort2", targetCohortName = "cohort1", targetCohortId = 1, ratio = Inf) -CDMConnector::cohort_count(cdm$matched_cohort1) +CDMConnector::cohort_count(cdm$matched_cohort2) +``` + +## Generate matched cohorts simultaneously across multiple cohorts +All these functionalities can be implemented across multiple cohorts simultaneously. Specify in `targetCohortId` parameter which are the cohorts of interest. If set to NULL, all the cohorts present in `targetCohortName` will be matched. + +```{r} +cdm <- generateMatchedCohortSet(cdm = cdm, + name = "matched_cohort3", + targetCohortName = "cohort1", + targetCohortId = c(1,3), + ratio = 2) + +CDMConnector::cohort_set(cdm$matched_cohort3) %>% arrange(cohort_definition_id) + +CDMConnector::cohort_count(cdm$matched_cohort3) %>% arrange(cohort_definition_id) ``` +Notice that each cohort has their own (and independent of other cohorts) matched cohort. From 534e12c61aa19ed3ea9b35ee4593db9cd038a75e Mon Sep 17 00:00:00 2001 From: Marta Alcalde-Herraiz Date: Fri, 12 Jan 2024 17:10:34 +0000 Subject: [PATCH 4/5] Update generateMatchedCohortSet.R --- R/generateMatchedCohortSet.R | 66 +++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/R/generateMatchedCohortSet.R b/R/generateMatchedCohortSet.R index d0d12f5e..538b3afa 100644 --- a/R/generateMatchedCohortSet.R +++ b/R/generateMatchedCohortSet.R @@ -16,7 +16,16 @@ #' #' @export #' -#' +#' @examples +#' library(DrugUtilisation) +#' library(CohortConstructor) +#' cdm <- mockDrugUtilisation(numberIndividuals = 100) +#' cdm$cohort1 %>% +#' requireCohortIntersectFlag(targetCohortTable = "cohort2", +#' targetCohortId = 1, +#' indexDate = "cohort_start_date", +#' window = c(-Inf, 0)) + generateMatchedCohortSet <- function(cdm, name, targetCohortName, @@ -64,6 +73,9 @@ generateMatchedCohortSet <- function(cdm, # Check cohort set ref cdm <- checkCohortSetRef(cdm, name, targetCohortName, matchSex, matchYearOfBirth, targetCohortId, n) + + # Rename cohort definition ids + cdm <- renameCohortDefinitionIds(cdm) } # Return return(cdm) @@ -440,3 +452,55 @@ checkCohortSetRef <- function(cdm, name, targetCohortName, matchSex, matchYearOf return(cdm) } + +renameCohortDefinitionIds <- function(cdm, name){ + new_cohort_set <- cdm[[name]] %>% + CDMConnector::cohort_set() %>% + dplyr::mutate(cohort_definition_id_new = target_cohort_id) %>% + arrange(cohort_definition_id_new) %>% + dplyr::mutate(cohort_definition_id_new = dplyr::row_number()) + + new_cohort_attrition <- cdm[[name]] %>% + CDMConnector::cohort_attrition() %>% + inner_join( + new_cohort_set %>% dplyr::select("cohort_definition_id","cohort_definition_id_new"), + by = "cohort_definition_id" + ) %>% + dplyr::select(-"cohort_definition_id") %>% + dplyr::rename("cohort_definition_id" = "cohort_definition_id_new") %>% + dplyr::relocate(cohort_definition_id) + + new_cohort_count <- cdm[[name]] %>% + CDMConnector::cohort_count() %>% + inner_join( + new_cohort_set %>% dplyr::select("cohort_definition_id","cohort_definition_id_new"), + by = "cohort_definition_id" + ) %>% + dplyr::select(-"cohort_definition_id") %>% + dplyr::rename("cohort_definition_id" = "cohort_definition_id_new") %>% + dplyr::relocate(cohort_definition_id) + + new_cohort <- cdm[[name]] %>% + inner_join( + new_cohort_set %>% dplyr::select("cohort_definition_id","cohort_definition_id_new"), + by = "cohort_definition_id", + copy = TRUE + ) %>% + dplyr::select(-"cohort_definition_id") %>% + dplyr::rename("cohort_definition_id" = "cohort_definition_id_new") %>% + dplyr::relocate(cohort_definition_id) %>% + CDMConnector::compute_query() %>% + CDMConnector::compute_query(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + + new_cohort_set <- new_cohort_set %>% + dplyr::select(-"cohort_definition_id") %>% + dplyr::rename("cohort_definition_id" = "cohort_definition_id_new") %>% + dplyr::relocate(cohort_definition_id) + + cdm[[name]] <- CDMConnector::new_generated_cohort_set( + cohort_ref = new_cohort, + cohort_attrition_ref = new_cohort_attrition , + cohort_set_ref = new_cohort_set, + cohort_count_ref = new_cohort_count, + overwrite = TRUE) +} From 3346e99f6e91c3781e0e6f3113a3485eda62ced6 Mon Sep 17 00:00:00 2001 From: Marta Alcalde-Herraiz Date: Fri, 12 Jan 2024 18:20:48 +0000 Subject: [PATCH 5/5] Update generateMatchedCohortSet() --- R/generateMatchedCohortSet.R | 38 +++++++++++-------- man/generateMatchedCohortSet.Rd | 15 ++++++++ .../testthat/test-generateMatchedCohortSet.R | 8 ++-- vignettes/a03_age_sex_matching.Rmd | 1 - 4 files changed, 41 insertions(+), 21 deletions(-) diff --git a/R/generateMatchedCohortSet.R b/R/generateMatchedCohortSet.R index 538b3afa..ad7d768a 100644 --- a/R/generateMatchedCohortSet.R +++ b/R/generateMatchedCohortSet.R @@ -19,13 +19,17 @@ #' @examples #' library(DrugUtilisation) #' library(CohortConstructor) +#' library(dplyr) #' cdm <- mockDrugUtilisation(numberIndividuals = 100) -#' cdm$cohort1 %>% -#' requireCohortIntersectFlag(targetCohortTable = "cohort2", -#' targetCohortId = 1, -#' indexDate = "cohort_start_date", -#' window = c(-Inf, 0)) - +#' cdm <- cdm %>% +#' generateMatchedCohortSet(name = "new_matched_cohort", +#' targetCohortName = "cohort1", +#' targetCohortId = c(1,2), +#' matchSex = TRUE, +#' matchYearOfBirth = TRUE, +#' ratio = 2) +#' cdm$new_matched_cohort +#' generateMatchedCohortSet <- function(cdm, name, targetCohortName, @@ -75,7 +79,7 @@ generateMatchedCohortSet <- function(cdm, cdm <- checkCohortSetRef(cdm, name, targetCohortName, matchSex, matchYearOfBirth, targetCohortId, n) # Rename cohort definition ids - cdm <- renameCohortDefinitionIds(cdm) + cdm <- renameCohortDefinitionIds(cdm, name) } # Return return(cdm) @@ -456,46 +460,46 @@ checkCohortSetRef <- function(cdm, name, targetCohortName, matchSex, matchYearOf renameCohortDefinitionIds <- function(cdm, name){ new_cohort_set <- cdm[[name]] %>% CDMConnector::cohort_set() %>% - dplyr::mutate(cohort_definition_id_new = target_cohort_id) %>% - arrange(cohort_definition_id_new) %>% + dplyr::mutate(cohort_definition_id_new = .data$target_cohort_id) %>% + dplyr::arrange(.data$cohort_definition_id_new) %>% dplyr::mutate(cohort_definition_id_new = dplyr::row_number()) new_cohort_attrition <- cdm[[name]] %>% CDMConnector::cohort_attrition() %>% - inner_join( + dplyr::inner_join( new_cohort_set %>% dplyr::select("cohort_definition_id","cohort_definition_id_new"), by = "cohort_definition_id" ) %>% dplyr::select(-"cohort_definition_id") %>% dplyr::rename("cohort_definition_id" = "cohort_definition_id_new") %>% - dplyr::relocate(cohort_definition_id) + dplyr::relocate(.data$cohort_definition_id) new_cohort_count <- cdm[[name]] %>% CDMConnector::cohort_count() %>% - inner_join( + dplyr::inner_join( new_cohort_set %>% dplyr::select("cohort_definition_id","cohort_definition_id_new"), by = "cohort_definition_id" ) %>% dplyr::select(-"cohort_definition_id") %>% dplyr::rename("cohort_definition_id" = "cohort_definition_id_new") %>% - dplyr::relocate(cohort_definition_id) + dplyr::relocate(.data$cohort_definition_id) new_cohort <- cdm[[name]] %>% - inner_join( + dplyr::inner_join( new_cohort_set %>% dplyr::select("cohort_definition_id","cohort_definition_id_new"), by = "cohort_definition_id", copy = TRUE ) %>% dplyr::select(-"cohort_definition_id") %>% dplyr::rename("cohort_definition_id" = "cohort_definition_id_new") %>% - dplyr::relocate(cohort_definition_id) %>% + dplyr::relocate(.data$cohort_definition_id) %>% CDMConnector::compute_query() %>% CDMConnector::compute_query(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) new_cohort_set <- new_cohort_set %>% dplyr::select(-"cohort_definition_id") %>% dplyr::rename("cohort_definition_id" = "cohort_definition_id_new") %>% - dplyr::relocate(cohort_definition_id) + dplyr::relocate(.data$cohort_definition_id) cdm[[name]] <- CDMConnector::new_generated_cohort_set( cohort_ref = new_cohort, @@ -503,4 +507,6 @@ renameCohortDefinitionIds <- function(cdm, name){ cohort_set_ref = new_cohort_set, cohort_count_ref = new_cohort_count, overwrite = TRUE) + + return(cdm) } diff --git a/man/generateMatchedCohortSet.Rd b/man/generateMatchedCohortSet.Rd index baa9aac7..39a61b7a 100644 --- a/man/generateMatchedCohortSet.Rd +++ b/man/generateMatchedCohortSet.Rd @@ -41,3 +41,18 @@ Generate a new cohort matched cohort from a preexisting target cohort. The new cohort will contain individuals not included in the target cohort with same year of birth (matchYearOfBirth = TRUE) and same sex (matchSex = TRUE). } +\examples{ +library(DrugUtilisation) +library(CohortConstructor) +library(dplyr) +cdm <- mockDrugUtilisation(numberIndividuals = 100) +cdm <- cdm \%>\% + generateMatchedCohortSet(name = "new_matched_cohort", + targetCohortName = "cohort1", + targetCohortId = c(1,2), + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 2) +cdm$new_matched_cohort + +} diff --git a/tests/testthat/test-generateMatchedCohortSet.R b/tests/testthat/test-generateMatchedCohortSet.R index 704857b9..47d2f248 100644 --- a/tests/testthat/test-generateMatchedCohortSet.R +++ b/tests/testthat/test-generateMatchedCohortSet.R @@ -291,19 +291,19 @@ test_that("test exactMatchingCohort with a ratio bigger than 1", { ratio = 4) expect_true(a[["new_cohort"]] %>% - dplyr::filter(cohort_definition_id %in% c(1,3)) %>% + dplyr::filter(cohort_definition_id %in% c(1,2)) %>% dplyr::summarise(subject_id) %>% dplyr::distinct() %>% dplyr::pull() %>% length() == 10) expect_true(a[["new_cohort"]] %>% - dplyr::filter(cohort_definition_id %in% c(2,4)) %>% + dplyr::filter(cohort_definition_id %in% c(3,4)) %>% dplyr::summarise(subject_id) %>% dplyr::distinct() %>% dplyr::pull() %>% length() == 10) expect_true(a[["new_cohort"]] %>% - dplyr::filter(cohort_definition_id %in% c(1,3)) %>% + dplyr::filter(cohort_definition_id %in% c(1,2)) %>% dplyr::summarise(cohort_start_date) %>% dplyr::distinct() %>% dplyr::pull() %>% length() == 2) expect_true(a[["new_cohort"]] %>% - dplyr::filter(cohort_definition_id %in% c(2,4)) %>% + dplyr::filter(cohort_definition_id %in% c(3,4)) %>% dplyr::summarise(cohort_start_date) %>% dplyr::distinct() %>% dplyr::pull() %>% length() == 2) diff --git a/vignettes/a03_age_sex_matching.Rmd b/vignettes/a03_age_sex_matching.Rmd index 26efe14b..a49591b9 100644 --- a/vignettes/a03_age_sex_matching.Rmd +++ b/vignettes/a03_age_sex_matching.Rmd @@ -33,7 +33,6 @@ As we will use `cohort1` to explore `generateMatchedCohortSet()`, let us first u ```{r} CDMConnector::cohort_set(cdm$cohort1) ``` -Notice that there are three cohorts within this tibble, with id's going from 1 to 3. # Use generateMatchedCohortSet() to create an age-sex matched cohort Let us first see an example of how this function works. For its usage, we need to provide a `cdm` object, the `targetCohortName`, which is the name of the table containing the cohort of interest, and the `name` of the new generated tibble containing the cohort and the matched cohort. We will also use the argument `targetCohortId` to specify that we only want a matched cohort for `cohort_definition_id = 1`.