forked from ellessenne/comorbidity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
testing.R
104 lines (84 loc) · 2.74 KB
/
testing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
### R script for testing
## Load libraries
library(comorbidity)
library(data.table)
library(tictoc)
## User function to replace factor levels using regex matching
### 'replace' must be a named list of regex codes, unmatched codes are dropped.
xfactor <- function(x, replace) {
x <- factor(x)
levels_tmp <- levels(x)
for (i in seq_along(replace)) {
levels_tmp[grepl(replace[i], levels_tmp)] <- names(replace)[i]
}
levels(x) <- levels_tmp
x <- factor(x, levels = names(regex))
}
## Create example dataset
### Number of patients
n_ids <- 1000000
### Average number of codes per patient
n_codes <- 50
set.seed(1)
dfr <- data.frame(
id = sample(1:n_ids, size = n_ids * n_codes, replace = TRUE),
code = sample_diag(n_ids * n_codes),
stringsAsFactors = FALSE
)
dfr <- dfr[order(dfr$id), ]
### Set options
id <- "id"
code <- "code"
score <- "charlson"
icd <- "icd10"
regex <- comorbidity:::lofregex[[score]][[icd]]
## Using current scoring algorithm
tictoc::tic()
### Split by ID
x1 <- utils::unstack(dfr, form = stats::as.formula(paste(code, id, sep = "~")))
### Run scoring algorithm
x1 <- comorbidity:::.score(x1, id = id, score = score, icd = icd, parallel = TRUE, mc.cores = 4)
x1[, -1] <- lapply(x1[, -1], as.integer)
tictoc::toc()
## New algorithm using base R
x2 <- dfr
tictoc::tic()
x2$code_f <- xfactor(x = x2[, code], replace = regex)
x2 <- unique(x2[, c(id, "code_f")])
x2 <- reshape2::dcast(x2, id ~ code_f, length, value.var = "code_f", fill = 0)
x2$`NA` <- NULL
x2[, id] <- as.character(x2[, id])
x2[, -1] <- lapply(x2[, -1], as.integer)
tictoc::toc()
identical(x1, x2)
## A further gain of speed using the data.table package
x3 <- dfr
tictoc::tic()
setDT(x3)
x3[, code_f := xfactor(x3[, code], regex)]
x3 <- dcast.data.table(unique(x3[, .(id, code_f, value = 1L)]), id ~ code_f, fill = 0)
x3[, `NA` := NULL]
x3[, id := as.character(id)]
setDF(x3)
tictoc::toc()
identical(x1, x3)
## Base approach
x4 <- dfr
tictoc::tic()
### Convert code to factor and replace codes with comorbidity names
x4$code_f <- xfactor(x = x4[, code], replace = regex)
### Drop records without any comorbidities and collapse duplicate comorbidities.
x4_w <- unique(x4[!is.na(x4$code_f), c(id, "code_f")])
### Reshape data wide, correct order and names of comorbidity variables
x4_w <- x4_w[order(x4_w[, id]), ]
x4_w$flag <- 1L
x4_w <- reshape(x4_w, idvar = id, timevar = "code_f", direction = "wide", sep = "_")
names(x4_w) <- gsub("flag_", "", names(x4_w))
x4_w <- x4_w[, c(id, names(regex))]
### Add back in records without any comorbidities, set NA values to 0.
x4 <- merge(unique(x4[, id, drop = FALSE]), x4_w, by = id, all.x = TRUE)
x4[is.na(x4)] <- 0L
### Change id variable to character (to match current output)
x4[, id] <- as.character(x4[, id])
tictoc::toc()
identical(x1, x4)