-
Notifications
You must be signed in to change notification settings - Fork 0
/
ct_adjustment_example.R
164 lines (126 loc) · 6.85 KB
/
ct_adjustment_example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#Running the different cell type adjustment methods on the simulated beta value matrix
#Loading the simulated data from the main simulation scenario. One could also load one of the other .RData files corresponding to other
#simulation scenarios.
# Data for simulation scenarios can be found at https://zenodo.org/record/34128#.VlMy8WSrSRs
# sim_beta is the matrix of simulated beta values (probes x samples).
# disease_status is a vector with the phenotype (0=control, 1=case, except in the continuous scenario dataset)
# true_disease is a character vector with the original patient disease status (data from before simulation). This is coded to levels D1 through D5.
# dmr contains the indices of CpGs that were simulated to be differentially methylated with the phenotype
load("simulated_data.RData")
#Random sample of data to speed up demonstration, if desired
set.seed(24601)
nprobes = 50000
rs = sample(1:nrow(sim_beta), nprobes)
sim_beta = sim_beta[rs,]
dmr = dmr[dmr %in% rs]
###################################################
#Model not adjusting for cell type
library(limma)
X.unadj = model.matrix(~disease_status)
lm.unadj = eBayes(lmFit(sim_beta, X.unadj))
p.unadj = lm.unadj$p.value[,2]
###################################################
#Code for reference-based and reference-free methods based on Houseman tutorial 2014
#http://people.oregonstate.edu/~housemae/software/TutorialLondon2014/
###################################################
#Reference-based method: http://bioconductor.org/packages/release/bioc/html/minfi.html
library(minfi)
library(FlowSorted.Blood.450k)
data(FlowSorted.Blood.450k.JaffeModelPars)
commondmr <- intersect(rownames(FlowSorted.Blood.450k.JaffeModelPars), rownames(sim_beta))
#Directly estimating cell type composition
cellcomps = minfi:::projectCellType(sim_beta[commondmr,], FlowSorted.Blood.450k.JaffeModelPars[commondmr,], lessThanOne=TRUE)
X.ref = model.matrix(~disease_status+cellcomps)
lm.ref = eBayes(lmFit(sim_beta, X.ref))
p.ref = lm.ref$p.value[,2]
###################################################
#Reference-free method: https://cran.r-project.org/web/packages/RefFreeEWAS/index.html
library(RefFreeEWAS)
#Estimated latent dimension
est_dim = EstDimRMT(t(scale(t(sim_beta - lm.unadj$coef %*% t(X.unadj)))), plot=FALSE)$dim
#Main function
mod = RefFreeEwasModel(sim_beta, cbind(1,disease_status), K=est_dim)
nboot = 100 #Number of bootstrap samples to obtain standard errors
boot = BootRefFreeEwasModel(mod, nboot)
#Extracting standard errors and calculating test statistic
se = apply(boot[,,"B",], 1:2, sd)
ts = abs(mod$Beta)/se
df = -diff(dim(model.matrix(~disease_status)))-apply(is.na(sim_beta),1,sum)
p.reffree = 2*pt(ts[,2], df=df, lower.tail=FALSE)
###################################################
#Surrogate variable analysis: https://www.bioconductor.org/packages/release/bioc/html/sva.html
library(sva)
#Null model matrix must be nested in the full model matrix
model_mat = model.matrix(~disease_status)
null_model_mat = model.matrix(~1, data.frame(disease_status))
#Main SVA function
SVA = sva(sim_beta, model_mat, null_model_mat, method="irw")
#Calculate p-values
update_model = cbind(model_mat, SVA$sv)
update_null_model = cbind(null_model_mat, SVA$sv)
p.sva = f.pvalue(sim_beta, update_model, update_null_model)
###################################################
#Independent Surrogate Variable Analysis: https://cran.fhcrc.org/web/packages/isva/index.html
library(isva)
ISVA = DoISVA(sim_beta, matrix(disease_status))
p.isva = ISVA$spv
###################################################
#Deconfounding: http://web.cbio.uct.ac.za/~renaud/CRAN/
library(deconf)
#Running deconfounding on subset to speed up demonstration
sampdec = sample(length(sim_beta[,1]), 1000)
tmpdec = deconfounding(sim_beta[sampdec,],3)
X.tmpdec = model.matrix(~disease_status+t(tmpdec$C$Matrix))
lm.tmpdec = eBayes(lmFit(sim_beta, X.tmpdec))
p.dec = lm.tmpdec$p.value[,2]
##################################################
#RUV: https://cran.r-project.org/web/packages/ruv/index.html
library(ruv)
#Choosing Jaffe CpG sites to be control probes for ruv
ruv_controls = which(rownames(sim_beta) %in% rownames(FlowSorted.Blood.450k.JaffeModelPars))
#Taking only the controls that do not correlate with the phenotype
subset_controls = ruv_controls[lm.unadj$p.value[ruv_controls,2]>0.01]
#Turning control probe vector into logical
ctl = rep(FALSE, dim(sim_beta)[1]); ctl[subset_controls] = TRUE
#getK estimates the latent dimension, but for this dataset it ends up being much too high
ruvK = getK(t(sim_beta), matrix(disease_status), ctl)$k
ruv_mod = RUV4(t(sim_beta), matrix(disease_status), ctl, k=ruvK)
p.ruv = ruv_mod$p
##################################################
#EWASher and CellCDecon need to be run externally... For EWASher, run the script "run_ewasher.sh", and for
#CellCDecon run the script "run_cellcdecon.sh". For both methods use the text files generated below as the input.
#EWASher http://research.microsoft.com/en-us/downloads/472fe637-7cb9-47d4-a0df-37118760ccd1/
#CellCDecon https://github.com/jameswagner/CellCDecon
#Function to save data to text files for use in EWASher and CellCDecon
makefiles_FLE = function(rSrcDir, data, phen, covar)
{
#write each input to file, then call RunFLEFromFiles
write.table(data, file = "datafileTmp.txt", col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
write.table(phen, file = "phenfileTmp.txt", col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
write.table(covar, file = "covarfileTmp.txt", col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
}
#Need to manipulate beta matrix to what EWASher and CCD expect
tmp_beta = data.frame(rownames(sim_beta), sim_beta)
colnames(tmp_beta) = c("ID", colnames(sim_beta))
#Directory to output files to
path_to_dir = "/home/data1/homeldi/kevin.mcgregor/research/marie_hudson/r_scripts/mixtures/github_tutorial"
#Make the EWASher input files, only phenotype (no covariates). The beta matrix file is used for
# CellCDecon as well
makefiles_FLE(path_to_dir, tmp_beta, data.frame(colnames(tmp_beta)[-1], disease_status), covar=NULL)
#Or save another version with patients' true disease (coded) from original 450K data before simulation
#as a covariate to compare performance of EWASher with/without the extra covariates
makefiles_FLE(path_to_dir, tmp_beta,
data.frame(colnames(tmp_beta)[-1], disease_status),
data.frame(1, colnames(tmp_beta[-1]), 1*(true_disease=="D1"), 1*(true_disease=="D2"),
1*(true_disease=="D3"), 1*(true_disease=="D4"), 1*(true_disease=="D5")) )
####################################################
#Generating qqplots for p-values for the different methods
library(qqman)
qq(p.unadj)
qq(p.ref)
qq(p.reffree)
qq(p.sva)
qq(p.isva)
qq(p.dec)
qq(p.ruv)
#CellCDecon and EWASher could be plotted as well if the p-values were loaded back into the workspace after running the methods externally.