-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_clean_data.R
213 lines (172 loc) · 13.3 KB
/
01_clean_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# Program Information ----------------------------------------------------
#
# Program: 01_sensitivity_clean_data.R
# Description: this script does some data cleaning relevant for all designs, and outputs a dataset for
# SCRI sensitivity analyses.
# It uses some code from Svetlana's original programming to ensure consistency in the data cleaning
# Requirements: dataset in RData format called D3_study_population_SCRI, note, should be in g_intermediate
# Output: this outputs a single dataset called sccs_data_extract to g_intermediate
# 0. HOUSEKEEPING ---------------------------------------------------------
# current script assumes wd is the project directory
if(!any(ls()=="thisdir")) thisdir <- paste0(getwd(),"/")
if(!any(ls()=="dirtemp")) dirtemp <- paste0(thisdir,"g_intermediate/")
if(!any(ls()=="diroutput")) diroutput <- paste0(thisdir,"g_output/")
# specify the input data name
raw_data_name <- "D3_study_population_SCRI"
# ensure required folders are created
dir.create(file.path(paste0(dirtemp, "sccs_sensitivity")), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path(paste0(thisdir,"log_files/sccs_sensitivity")), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path(paste0(diroutput, "sccs_sensitivity")), showWarnings = FALSE, recursive = TRUE)
# 1. IMPORT DATA ----------------------------------------------------------
# Import data
# CONFIRM: IS THIS RDATA OR CSV FOR DAPS??
load(paste0(dirtemp, raw_data_name, ".RData"))
scri_data_extract <- eval(parse(text = raw_data_name))
# 2. DATA CLEANING ------------------------------------------------------
## data cleaning which should apply to all case series, irrespective of DAP and outcome
## QC and sense checks
# Check key variables exist and are expected format, otherwise throw warning
key_vars <- c("date_vax1", "date_vax2", "date_vax3", "study_entry_date", "study_exit_date")
# check the vaccine, entry, and exit dates exist
invisible(lapply(key_vars, function(x) if (!(x %in% colnames(scri_data_extract))) {
stop(paste(x," does not exist"))
}))
# Harmonise variable names; remove underscores
oldnames <- c(names(scri_data_extract[grepl("vax", names(scri_data_extract))]),
names(scri_data_extract[grepl("covid", names(scri_data_extract))]))
newnames <- gsub("vax_", "vax", oldnames)
newnames <- gsub("covid_19|covid_date", "covid19", newnames)
names(scri_data_extract)[match(oldnames, names(scri_data_extract))] <- newnames
# create list of date variable names for later use (except the study start date)
date_vars <- colnames(scri_data_extract)[grepl("date", colnames(scri_data_extract))==T]
date_vars <- date_vars[date_vars != "start_study_date"]
date_vars_days <- gsub("date","days",date_vars)
# convert numeric to date (dates may be numeric in BIFAP)
scri_data_extract[date_vars] <- lapply(scri_data_extract[date_vars],
function(x) if (is.numeric(x)) as.Date(x, origin="1970-01-01") else x)
# check for characters
invisible(lapply(colnames(scri_data_extract[date_vars]), function(x) {
if (is.character(scri_data_extract[[x]])) {
message(paste(x, "is character, check the format for converting dates as program assumes YYYY-MM-DD"))
}
}))
# convert character to date (assumed format YYYY-MM-DD)
scri_data_extract[date_vars] <- lapply(scri_data_extract[date_vars],
function(x) if (is.character(x)) as.Date(x) else x)
# double check that formats are as expected
message("check the formats of key variables, these should be a date format")
lapply(scri_data_extract[date_vars], function(x) class(x))
# create a variable for study start date
# only relevant where someone enters late, not applicable to this study as there is no changing eligibility in age which would allow someone late entry
start_scri_date <- as.Date("2020-09-01")
# the following code is retained in case we do want to allow for late entry, it is not run because the simulated data only have people entering after the study start date
# scri_data_extract$start_study_date <- pmax(scri_data_extract$study_entry_date, start_scri_date, na.rm = T)
scri_data_extract$start_study_date <- start_scri_date
# get max number of vaccine doses
nvax <- names(scri_data_extract)[ substring(names(scri_data_extract),1,8)=="date_vax" ]
nvax <- max(as.numeric(substring(nvax,9)))
# Basic dataset information
message(paste("There are", nrow(scri_data_extract), "individuals in this dataset"))
message(paste("The maximum number of vaccine doses was", nvax))
# flag how common late entry into the dataset is, anticipated to be very low or zero
if(any(scri_data_extract$study_entry_date > scri_data_extract$start_study_date))
message(paste("There are", sum(scri_data_extract$study_entry_date > scri_data_extract$start_study_date), "persons with study entry date after the SCRI start date"))
# change the date variables to days from the calendar start date of our observation period
scri_data_extract[date_vars_days] <- lapply(scri_data_extract[date_vars],
function(x){round(difftime(x, start_scri_date, units = "days"), 0)})
# Confirm that the first vaccine dose occurs after the study entry date, remove rows if this does not happen
if(any(scri_data_extract$start_study_date > scri_data_extract$date_vax1 & !is.na(scri_data_extract$date_vax1) & !is.na(scri_data_extract$start_study_date), na.rm=T )){
message(paste("'study_entry_days' after the vax1 for ", sum(scri_data_extract$start_study_date > scri_data_extract$date_vax1, na.rm=T), "rows. They are deleted!"))
nrow0<-nrow(scri_data_extract)
scri_data_extract <- scri_data_extract[ scri_data_extract$start_study_date <= scri_data_extract$date_vax1 & scri_data_extract$vax1==1 & !is.na(scri_data_extract$start_study_date),] # if study_entry_days > vax1 ==> delete rows
message(c(new_nrow=nrow(scri_data_extract), old_nrow=nrow0))
}
# Check the time difference (in days) between vaccine doses
# It should be at least 14 days; print warning if this is not the case
scri_data_extract$dose_diff = as.numeric(difftime(scri_data_extract$date_vax2, scri_data_extract$date_vax1,units="days"))
message("Distribution of time between dose 1 and 2")
summary(scri_data_extract$dose_diff)
if(any(min(scri_data_extract$dose_diff[!is.na(scri_data_extract$dose_diff)])<14))
message(paste("There are ", sum(scri_data_extract$dose_diff<14, na.rm=T), " persons with (date_vax2 - date_vax1) < 14 days."))
# do the same for dose 3 to dose 2
# add an extra check to check whether the planned control time of 88 days is available
scri_data_extract$dose_diff2 = as.numeric(difftime(scri_data_extract$date_vax3, scri_data_extract$date_vax2,units="days"))
message("Distribution of time between dose 2 and 3")
summary(scri_data_extract$dose_diff2)
if(any(min(scri_data_extract$dose_diff2[!is.na(scri_data_extract$dose_diff2)])<14))
message(paste("There are ", sum(scri_data_extract$dose_diff2<14, na.rm=T), " persons with (date_vax3 - date_vax2) < 14 days."))
if(any(min(scri_data_extract$dose_diff2[!is.na(scri_data_extract$dose_diff2)])<88))
message(paste("There are ", sum(scri_data_extract$dose_diff2<14, na.rm=T), " persons with (date_vax3 - date_vax2) < 88 days (needed control time for post vaccine SCRI)"))
# Check whether someone's study exit date occurs before the vaccine, remove rows if this does happen
if(any(scri_data_extract$study_exit_days < scri_data_extract$days_vax1 & !is.na(scri_data_extract$days_vax1), na.rm = T)) {
message(paste0("There are ",sum(scri_data_extract$study_exit_days < scri_data_extract$days_vax1,na.rm=T)," persons with 'study_exit_time' before vax1. They are deleted!"))
nrow0<-nrow(scri_data_extract)
scri_data_extract <- scri_data_extract[scri_data_extract$days_vax1 <= scri_data_extract$study_exit_days & !is.na(scri_data_extract$days_vax1),] # if study_exit_days < vax1 ==> delete rows
}
# number of people with second vaccine dose before end of study (cond is just used in the check for clarity)
if(any((cond <- !is.na(scri_data_extract$days_vax2) & scri_data_extract$study_exit_days < scri_data_extract$days_vax2) )){
message(paste0("There are ",sum( cond, na.rm=T)," persons with 'study_exit_time' before vax2!"))
scri_data_extract$study_exit_days[cond] <- scri_data_extract$days_vax2[cond]
}
# number of people with myocarditis after end of study (cond is just used in the check for clarity)
if(any(scri_data_extract$study_exit_date < scri_data_extract$myocarditis_date & !is.na(scri_data_extract$myocarditis_date), na.rm = T)) {
message(paste0("There are ",sum(scri_data_extract$study_exit_date < scri_data_extract$myocarditis_date,na.rm=T)," persons with 'study_exit_time' before myocarditis_date."))
}
# cleaning of vaccine doses (only applies if more than 2)
# follows conventions set for myo/perio analyses, so have retained
if(nvax > 2){
while(any(cond <- !is.na(scri_data_extract$days_vax2) & (scri_data_extract$days_vax2 - scri_data_extract$days_vax1) < 5)){
message(paste(sum(cond),"'dose2' were replace with next dose because dose2 is less than 5 days after dose1."))
for(iv in 3:nvax){
scri_data_extract[cond, paste0("date_vax",iv-1) ] <- scri_data_extract[cond, paste0("date_vax",iv) ]
scri_data_extract[cond, paste0("days_vax",iv-1) ] <- scri_data_extract[cond, paste0("days_vax",iv) ]
scri_data_extract[cond, paste0("type_vax",iv-1) ] <- scri_data_extract[cond, paste0("type_vax",iv) ]
scri_data_extract[cond, paste0("vax" ,iv-1) ] <- scri_data_extract[cond, paste0("vax" ,iv) ]
}
}
}
# update the basic dataset summaries after the above cleaning
scri_data_extract$dose_diff <- as.numeric(difftime(scri_data_extract$date_vax2, scri_data_extract$date_vax1 ,units="days"))
message("Distribution of time between dose 1 and 2 after data cleaning")
summary(scri_data_extract$dose_diff)
if(any(min(scri_data_extract$dose_diff[!is.na(scri_data_extract$dose_diff)])<14))
message(paste("There are ", sum(scri_data_extract$dose_diff<14, na.rm=T), " persons with (date_vax2 - date_vax1) < 14 days after data cleaning"))
scri_data_extract$dose_diff2 = as.numeric(difftime(scri_data_extract$date_vax3, scri_data_extract$date_vax2,units="days"))
message("Distribution of time between dose 2 and 3 after data cleaning")
summary(scri_data_extract$dose_diff2)
if(any(min(scri_data_extract$dose_diff2[!is.na(scri_data_extract$dose_diff2)])<14))
message(paste("There are ", sum(scri_data_extract$dose_diff2<14, na.rm=T), " persons with (date_vax3 - date_vax2) < 14 days after data cleaning"))
if(any(min(scri_data_extract$dose_diff2[!is.na(scri_data_extract$dose_diff2)])<88))
message(paste("There are ", sum(scri_data_extract$dose_diff2<14, na.rm=T), " persons with (date_vax3 - date_vax2) < 88 days after data cleaning"))
# create a variable for the last possible vaccine date for the post vaccine SCRI analyses (max of dose 1 and 2, because control time will start after dose 2)
scri_data_extract$days_last_vax <- pmax(scri_data_extract$days_vax1, scri_data_extract$days_vax2, na.rm = T)
# create myopericarditis date variable
scri_data_extract$myopericarditis_date <- pmin(scri_data_extract$myocarditis_date, scri_data_extract$pericarditis_date,na.rm=T)
# create vaccine indicator variables
scri_data_extract$vax1 <- !is.na(scri_data_extract$date_vax1)
scri_data_extract$vax2 <- !is.na(scri_data_extract$date_vax2)
scri_data_extract$vax3 <- !is.na(scri_data_extract$date_vax3)
# what is the time distribution between doses?
scri_data_extract$diff_d1d2 <- as.numeric(scri_data_extract$date_vax2 - scri_data_extract$date_vax1)
scri_data_extract$diff_d2d3 <- as.numeric(scri_data_extract$date_vax3 - scri_data_extract$date_vax2)
# number of people with vaccine after event
scri_data_extract$dose1_afterevent <- (scri_data_extract$myopericarditis_date <= scri_data_extract$date_vax1)
scri_data_extract$dose2_afterevent <- ((scri_data_extract$myopericarditis_date <= scri_data_extract$date_vax2) & (scri_data_extract$myopericarditis_date > scri_data_extract$date_vax1) & !is.na(scri_data_extract$date_vax2))
scri_data_extract$dose3_afterevent <- ((scri_data_extract$myopericarditis_date <= scri_data_extract$date_vax3) & (scri_data_extract$myopericarditis_date > scri_data_extract$date_vax2) & !is.na(scri_data_extract$date_vax3))
# Time between event and doses?
scri_data_extract$diff_event_d1 <- as.numeric(scri_data_extract$date_vax1 - scri_data_extract$myopericarditis_date)
scri_data_extract$diff_event_d2 <- as.numeric(scri_data_extract$date_vax2 - scri_data_extract$myopericarditis_date)
scri_data_extract$diff_event_d3 <- as.numeric(scri_data_extract$date_vax3 - scri_data_extract$myopericarditis_date)
# create a numeric ID variable
scri_data_extract$numeric_id <- match(scri_data_extract$person_id, unique(scri_data_extract$person_id))
### throw a warning if duplicate PIDs
if(any(duplicated(scri_data_extract$numeric_id))) {
warning("there are unexpected duplicate PIDs in the data, check with DAP")
message("duplicate PIDs dropped to enable function to run")
scri_data_extract <- scri_data_extract %>%
distinct(numeric_id, .keep_all = TRUE) # should really be dropped by DAP
}
# 3. SAVE DATA ------------------------------------------------------------
# saving data with new name to prevent overwriting any prior files
sccs_data_extract <- scri_data_extract
save(sccs_data_extract,file = paste0(dirtemp,"sccs_sensitivity/","sccs_data_extract.RData"))