-
Notifications
You must be signed in to change notification settings - Fork 0
/
deAnonymize_pilot_3_4.R
242 lines (193 loc) · 10.7 KB
/
deAnonymize_pilot_3_4.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
library(Seurat)
##
## Data loading
##
get.data <- function(pilot, lane) {
data <- read.table(paste0("~/Documents/Pilot ",pilot,"/deAnonymizer/pilot",pilot,"lane",lane,"_results_deanonymize_chrALL.txt"), header = T, sep = "\t", fill=T)
rownames(data) <- paste0(sub("(.+)\\-{1}\\d{1}", "\\1", data$cell_id), "_lane", lane)
data$lane <- lane
sample.names <- get.sample.names(data)
data$assigned_sample.s. <- set.sample.name(data$Singlet_samp, sample.names)
data.xy <- read.table(paste0("~/Documents/Pilot ",pilot,"/xy/lane_",lane,".txt"), header = T)
data.xy <- data.xy[order(data.xy$cell),] # order
data.exp <- Read10X(paste0("~/Documents/Pilot ",pilot,"/data/lane_",lane,"/"))
data.seurat <- new("seurat", raw.data = data.exp)
data.seurat <- Setup(data.seurat, min.cells = 0, min.genes = 0, project = paste0("pilot",pilot,".lane",lane), do.scale = F, do.center = F, names.field = 1, names.delim = "\\-")
data.meta <- FetchData(data.seurat, c("nUMI", "nGene"))
return(cbind(data, data.xy, data.meta))
}
get.sample.names <- function(data) {
sample.names <- (c(as.character(data[data$outcome=="Singlet" & data$Singlet_samp == 0,]$assigned_sample.s.)[1],
as.character(data[data$outcome=="Singlet" & data$Singlet_samp == 1,]$assigned_sample.s.)[1],
as.character(data[data$outcome=="Singlet" & data$Singlet_samp == 2,]$assigned_sample.s.)[1],
as.character(data[data$outcome=="Singlet" & data$Singlet_samp == 3,]$assigned_sample.s.)[1],
as.character(data[data$outcome=="Singlet" & data$Singlet_samp == 4,]$assigned_sample.s.)[1],
as.character(data[data$outcome=="Singlet" & data$Singlet_samp == 5,]$assigned_sample.s.)[1]))
return(sample.names)
}
set.sample.name <- function(id, names) names[id+1]
##
## Plot helper functions
##
source("./multiplot.R")
colors.x = c("#009ddb", "#edba1b", "#71bc4b","#153057", "#965ec8", "#e64b50")
give.n <- function(x) {
return(data.frame(y = unname(quantile(x, probs = 0.75, type = 3))-0.25, label = paste0("N=",length(x))))
}
boxplot.samples <- function(data, doublets=F, notch=T) {
colors <- colors.x[!is.na(get.sample.names(data))]
#if (!doublets) data <- subset(data, outcome=="Singlet")
if (!doublets) data <- subset(data, llkDoublet.llkSinglet < 0)
data$Singlet_samp <- as.character(data$Singlet_samp)
data$Singlet_samp[data$Singlet_samp == "0"] = "1 (M)"
data$Singlet_samp[data$Singlet_samp == "1"] = "2 (F)"
data$Singlet_samp[data$Singlet_samp == "2"] = "3 (M)"
data$Singlet_samp[data$Singlet_samp == "3"] = "4 (F)"
data$Singlet_samp[data$Singlet_samp == "4"] = "5 (F)"
data$Singlet_samp[data$Singlet_samp == "5"] = "6 (M)"
data$Singlet_samp <- as.factor(data$Singlet_samp)
data$Singlet_samp = factor(data$Singlet_samp, levels(data$Singlet_samp)[c(2,3,4,1,6,5)])
ggplot(data, aes(x=as.factor(Singlet_samp), y=y)) +
geom_boxplot(notch=notch, color = colors, outlier.alpha = 0.3) +
stat_summary(geom = 'point', fun.y=mean, shape = 23, size = 5, color = colors) +
theme_bw(base_family = 'Helvetica', base_size = 16) +
theme(panel.background = element_rect(fill = "#f6f6f7")) +
guides(colour = guide_legend(override.aes = list(size=12))) +
coord_cartesian(ylim = c(0, 12)) +
stat_summary(fun.data = give.n, geom = "text", color = "black") +
xlab("Samples") +
ylab("Y Reads") +
theme(plot.margin = unit(c(0.5,2,0.5,0.5), "cm"),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# ggplot(data, aes(x=as.factor(Singlet_samp), y=y)) +
# geom_boxplot()
# scale_x_discrete(labels=c(1:6)) +
# scale_color_manual(labels=c(1:6)) +
# scale_fill_manual(labels=c(1:6))
# theme(plot.margin = unit(c(0.5,4.5,0.5,0.5), "cm"))
}
plot.scores <- function(data) {
ggplot(data, aes(x=llkDoublet.llkSinglet, y=nGene, color = factor(Singlet_samp))) +
geom_point(alpha = 1, size = 0.6) +
theme_bw(base_size = 16, base_family = 'Helvetica') +
theme(legend.title=element_blank()) +
scale_color_manual(labels=1:6, values=colors.x) +
# scale_color_manual(labels=get.sample.names(data), values=colors.x) +
guides(colour = guide_legend(override.aes = list(size=8))) +
theme(panel.background = element_rect(fill = "#f6f6f7")) +
geom_vline(xintercept = 0, color="black") +
geom_vline(xintercept = 100, color="red") +
xlab("Doublet - singlet likelihood score")
#coord_cartesian(ylim = c(0, 3000), xlim = c(-250, 750))
}
multiplot(plot.scores(pilot3.lane1), boxplot.samples(pilot3.lane1, doublets = F), cols = 1)
plot.scores.dif <- function(data) {
ggplot(data, aes(x=llkDoublet.llkSinglet, y=difSingletSamp, color = factor(Singlet_samp))) +
geom_point(alpha = 1, size = 0.6) +
theme_bw(base_size = 13, base_family = 'Helvetica') +
theme(legend.title=element_blank()) +
scale_color_manual(labels=get.sample.names(data), values=colors.x) +
guides(colour = guide_legend(override.aes = list(size=8))) +
theme(panel.background = element_rect(fill = "#f6f6f7")) +
geom_vline(xintercept = 0, color="red") +
geom_vline(xintercept = 100, color="black") +
geom_hline(yintercept = 2, color="black")
#coord_cartesian(ylim = c(0, 3000), xlim = c(-250, 750))
}
plot.scores.ngenes <- function(data) {
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
ggplot(data, aes(x=llkDoublet.llkSinglet, y=llkDoublet, color = nGene)) +
geom_point(alpha = 1, size = 0.6) +
theme_bw(base_size = 13) +
scale_colour_gradientn(colours = myPalette(100)) +
theme(panel.background = element_rect(fill = "#f6f6f7")) +
geom_vline(xintercept = 0, color="red") +
geom_vline(xintercept = 100, color="black")
#coord_cartesian(ylim = c(-1000, 0), xlim = c(-250, 750))
}
##
## Load data
##
pilot4.lane1 <- get.data(pilot = 4, lane = 1)
pilot4.lane2 <- get.data(pilot = 4, lane = 2)
pilot4.lane3 <- get.data(pilot = 4, lane = 3)
pilot3.lane1 <- get.data(pilot = 3, lane = 1)
pilot3.lane2 <- get.data(pilot = 3, lane = 2)
pilot3.lane3 <- get.data(pilot = 3, lane = 3)
pilot3.lane4 <- get.data(pilot = 3, lane = 4)
pilot3.lane5 <- get.data(pilot = 3, lane = 5)
pilot3.lane6 <- get.data(pilot = 3, lane = 6)
pilot3.lane7 <- get.data(pilot = 3, lane = 7)
pilot3.lane8 <- get.data(pilot = 3, lane = 8)
# Combine and save the data
pilot3.de.anonymize <- rbind(pilot3.lane1, pilot3.lane2, pilot3.lane3, pilot3.lane4, pilot3.lane5, pilot3.lane6, pilot3.lane7, pilot3.lane8)
save(pilot3.de.anonymize, file = "./data/pilot3_deAnonymize.Rda")
##
## Workspace
##
dim(pilot3.de.anonymize)
divide.larger <- function(a, b) {
a <- abs(a)
b <- abs(b)
return(max(a, b) / min(a, b))
}
pilot4.lane1$difSingletSamp <- mapply(divide.larger, pilot4.lane1$llkSingletSamp1, pilot4.lane1$llkSingletSamp2)
pilot4.lane1$difSingletSamp <- mapply(divide.larger, pilot4.lane1$llkSingletSamp1, pilot4.lane1$llkSingletSamp2)
pilot4.lane2$difSingletSamp <- mapply(divide.larger, pilot4.lane2$llkSingletSamp1, pilot4.lane2$llkSingletSamp2)
pilot4.lane1.gonl$difSingletSamp <- mapply(divide.larger, pilot4.lane1.gonl$llkSingletSamp1, pilot4.lane1.gonl$llkSingletSamp2)
pilot3.lane2$difSingletSamp <- mapply(divide.larger, pilot3.lane2$llkSingletSamp1, pilot3.lane2$llkSingletSamp2)
pilot3.lane6$difSingletSamp <- mapply(divide.larger, pilot3.lane6$llkSingletSamp1, pilot3.lane6$llkSingletSamp2)
plot.scores.dif(pilot4.lane1)
plot.scores.dif(pilot4.lane1.gonl)
plot.scores.dif(pilot4.lane2)
plot.scores.dif(pilot3.lane2)
plot.scores.dif(pilot3.lane6)
pilot4.lane1.gonl <- read.table("~/Documents/Pilot 4/deAnonymizer/pilot4lane1_gonl_results_deanonymize_chrALL.txt", header = T, sep = "\t", fill=T)
boxplot(pilot4.lane1.gonl$nSNPs_tested, pilot4.lane1[pilot4.lane1$outcome!="Singlet",]$nSNPs_tested)
multiplot(plot.scores(pilot4.lane1), boxplot.samples(pilot4.lane1), cols = 1)
multiplot(plot.scores(pilot4.lane2), boxplot.samples(pilot4.lane2), cols = 1)
multiplot(plot.scores(pilot4.lane3), boxplot.samples(pilot4.lane3, doublets = F), cols = 1)
multiplot(plot.scores(pilot3.lane1), boxplot.samples(pilot3.lane1, doublets = F), cols = 1)
multiplot(plot.scores(pilot3.lane2), boxplot.samples(pilot3.lane2, doublets = F), cols = 1)
multiplot(plot.scores(pilot3.lane3), boxplot.samples(pilot3.lane3, doublets = F), cols = 1)
multiplot(plot.scores(pilot3.lane4), boxplot.samples(pilot3.lane4, doublets = F), cols = 1)
multiplot(plot.scores(pilot3.lane5), boxplot.samples(pilot3.lane5, doublets = F), cols = 1)
multiplot(plot.scores(pilot3.lane6), boxplot.samples(pilot3.lane6, doublets = F), cols = 1)
multiplot(plot.scores(pilot3.lane7), boxplot.samples(pilot3.lane7, doublets = F), cols = 1)
multiplot(plot.scores(pilot3.lane8), boxplot.samples(pilot3.lane8, doublets = F), cols = 1)
plot.scores(pilot3.lane4)
boxplot(pilot3.lane2[pilot3.lane2$outcome=="Doublet",]$y)
pilot4.lane3.doubt <- pilot4.lane3[pilot4.lane3$llkDoublet.llkSinglet > 0 & pilot4.lane3$llkDoublet.llkSinglet < 100,]
pilot4.lane2.doubt <- pilot4.lane2[pilot4.lane2$llkDoublet.llkSinglet > 0 & pilot4.lane2$llkDoublet.llkSinglet < 100,]
pilot4.lane1.doubt <- pilot4.lane1[pilot4.lane1$llkDoublet.llkSinglet > 0 & pilot4.lane1$llkDoublet.llkSinglet < 100,]
pilot4.lane3.doubt$llkSingletSamp1/pilot4.lane3.doubt$llkSingletSamp2
barplot(table(c(pilot4.lane3.doubt$Singlet_samp, pilot4.lane2.doubt$Singlet_samp)))
barplot(table(c(pilot4.lane3.doubt$Doub_samp1, pilot4.lane3.doubt$Doub_samp2, pilot4.lane2.doubt$Doub_samp1, pilot4.lane2.doubt$Doub_samp2)))
barplot(table(pilot4.lane1.doubt$Singlet_samp))
pilot3.lane4.doubt <- pilot3.lane4[pilot3.lane4$llkDoublet.llkSinglet > 0 & pilot3.lane4$llkDoublet.llkSinglet < 100,]
pilot3.lane4.sure <- pilot3.lane4[pilot3.lane4$llkDoublet.llkSinglet > 100,]
boxplot(pilot3.lane4.doubt$nUMI / pilot3.lane4.doubt$nGene, pilot3.lane4.sure$nUMI/pilot3.lane4.sure$nGene,
names = c("False positives", "True doublets"), ylab="UMIs / Gene", outline = F)
pilot3.lane4.doubt <- pilot3.lane4[pilot3.lane4$llkDoublet.llkSinglet > 0,]
nrow(pilot3.lane4.doubt) / nrow(pilot3.lane4)
nrow(pilot3.lane4.sure) / nrow(pilot3.lane4)
pilot3.lane1.doubt <- pilot3.lane1[pilot3.lane1$llkDoublet.llkSinglet > 0,]
pilot3.lane1.sure <- pilot3.lane1[pilot3.lane1$llkDoublet.llkSinglet > 100,]
nrow(pilot3.lane1.doubt) / nrow(pilot3.lane1)
nrow(pilot3.lane1.sure) / nrow(pilot3.lane1)
get.percentage <- function(lane) {
print(sum(lane$llkDoublet.llkSinglet > 0))
print(sum(lane$llkDoublet.llkSinglet > 0) / nrow(lane))
print(sum(lane$llkDoublet.llkSinglet > 100))
print(sum(lane$llkDoublet.llkSinglet > 100) / nrow(lane))
}
get.percentage(pilot3.lane1)
get.percentage(pilot3.lane2)
get.percentage(pilot3.lane3)
get.percentage(pilot3.lane4)
get.percentage(pilot3.lane5)
get.percentage(pilot3.lane6)
get.percentage(pilot3.lane7)
get.percentage(pilot3.lane8)