-
Notifications
You must be signed in to change notification settings - Fork 1
/
XIST.R
283 lines (239 loc) · 13.3 KB
/
XIST.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# Author: Aura Zelco
# Brief description:
# This script is used for calculating the number of samples used and the XIST expression
# Brief procedure:
# 1. reads the RDS and saves the RNA expression for XIST for DISCO and Velmeshev dataset (10-20 years on server, 2nd trimester no XIST)
# 2. Merges the XIST results into a df and adds the metadata imported at the same time as the RDSs
# 3. Calculates the number of samples for paper
# 4. Plots the XIST expression
# Documentation abbreviations:
# F and M: females and males
# ct: celltype
# df: dataframe
# ds: dataset
#---------------------------------------------------------------------------------------------------
# 0. Import Libraries
library(Seurat)
library(stringr)
library(tidyr)
library(ggplot2)
#---------------------------------------------------------------------------------------------------
####### ON FURU
out_path <- "UCSC/XIST/"
dir.create(out_path, recursive = T, showWarnings = F)
# Velmeshev 2nd trimester -> No XIST
# Velmeshev 10-20 years
velm_10_20_years <- readRDS("UCSC/Seurat_UCSC/Velmeshev/Velmeshev_2022_10_20_years.rds")
velm_10_20_years_xist <- GetAssayData(velm_10_20_years[["RNA"]], slot="data")["XIST",]
write.csv(as.data.frame(velm_10_20_years_xist), paste0(out_path, "Velmeshev_2022_10_20_years_XIST.csv"))
#---------------------------------------------------------------------------------------------------
####### LOCAL
# 1. Read local RDS and save XIST info in list, and cell info in another list
ds_paths <- c("Velmeshev_3rd_trimester"= "UCSC/Seurat_UCSC/",
"Velmeshev_0_1_years"="UCSC/Seurat_UCSC/",
"Velmeshev_1_2_years"= "UCSC/Seurat_UCSC/",
"Velmeshev_2_4_years"= "UCSC/Seurat_UCSC/",
"Velmeshev_Adults" = "UCSC/Seurat_UCSC/",
"DISCO" = "DISCOv1.0/")
ds_names <- c("Velmeshev_3rd_trimester"= "Velmeshev_2022_3rd_trimester",
"Velmeshev_0_1_years"= "Velmeshev_2022_0_1_years",
"Velmeshev_1_2_years"= "Velmeshev_2022_1_2_years",
"Velmeshev_2_4_years"= "Velmeshev_2022_2_4_years",
"Velmeshev_Adults"= "Velmeshev_2022_Adult",
"DISCO" = "brainV1.0_all_FM_filt")
xist_ls <- list()
cell_info <- list()
for (id in names(ds_paths)) {
print(id)
if (id == "DISCO") {
id_seurat <- readRDS(paste0(ds_paths[[id]], ds_names[[id]], ".rds"))
xist_ls <- append(xist_ls, list(data.frame("XIST"=GetAssayData(id_seurat[["RNA"]], slot="data")["XIST",])))
cell_info <- append(cell_info, list(read.csv(paste0(ds_paths[[id]][1], "DEGs_common/cell_info.csv"))))
rm(id_seurat)
} else {
id_seurat <- readRDS(paste0(ds_paths[[id]], ds_names[[id]], ".rds"))
xist_ls <- append(xist_ls, list(data.frame("XIST"=GetAssayData(id_seurat[["RNA"]], slot="data")["XIST",])))
cell_info <- append(cell_info, list(read.csv(paste0("UCSC/DEGs_adjust_pval/", ds_names[[id]], "/cell_info_", ds_names[[id]], ".csv"))))
rm(id_seurat)
}
}
names(xist_ls) <- names(ds_paths)
names(cell_info) <- names(ds_paths)
# 2. Manually add Velmeshev_10_20 years
xist_ls <- append(xist_ls, list("Velmeshev_10_20_years"=read.csv("UCSC/outputs/Velmeshev/Velmeshev_2022_10_20_years_XIST.csv")))
rownames(xist_ls$Velmeshev_10_20_years) <- xist_ls$Velmeshev_10_20_years$X
xist_ls$Velmeshev_10_20_years$X <- NULL
colnames(xist_ls$Velmeshev_10_20_years) <- "XIST"
cell_info <- append(cell_info, list("Velmeshev_10_20_years"=read.csv("UCSC/DEGs_adjust_pval/Velmeshev_2022_10_20_years/cell_info_Velmeshev_2022_10_20_years.csv")))
# 3. Merge XIST in df and formats the information
xist_df <- do.call(rbind, xist_ls)
xist_df <- cbind("id"=rownames(xist_df), xist_df)
rownames(xist_df) <- NULL
ids_split <-regmatches(xist_df$id, regexpr("\\.", xist_df$id), invert = TRUE)
ids_split <- as.data.frame(do.call(rbind, ids_split))
xist_df$group <- ids_split$V1
xist_df$sample_id <- ids_split$V2
xist_df <- cbind(xist_df[, c(3:4)], xist_df[, c(2)])
names(xist_df)[names(xist_df)=="xist_df[, c(2)]"] <- "XIST"
# 4. Adds the metadata about the cells
xist_df$info <- rep(NA, nrow(xist_df))
for (group in unique(xist_df$group)) {
print(group)
for (id in unique(xist_df[which(xist_df$group==group), "sample_id"])) {
xist_df[which(xist_df$group==group & xist_df$sample_id==id), "info"] <- cell_info[[group]][which(cell_info[[group]]$cell_id==id), "og_group"]
}
}
# 5. Uniforms the sex metadata
xist_df$info <- str_replace_all(xist_df$info, c("Female"="F", "Male"="M"))
# 6. Formats the ids so they can be split into barcodes and samples
xist_df$new_ids <- xist_df$sample_id
xist_df$new_ids <- str_replace_all(xist_df$new_ids, c("-1_"="-1/", "-1--"="-1/"))
xist_df <- separate(xist_df, new_ids, into=c("barcodes", "samples"), sep="/")
# 7. Creates sex column
xist_df$sex <- rep(NA, nrow(xist_df))
xist_df[grep("M_", xist_df$info), "sex"] <- "M"
xist_df[grep("F_", xist_df$info), "sex"] <- "F"
# 8. XIST RNA expression plot
disco <- as.data.frame(do.call(rbind, str_split(xist_df[which(xist_df$group=="DISCO"), "info"], "_")))
proj_ids <- unique(disco$V1)
dis_ids <- unique(disco$V3)
#for (i in 1:(length(names(cell_info)) -1)) {
# xist_df[which(xist_df$sample_id %in% cell_info[[i]]$cell_id), "group"] <- names(cell_info)[i]
#}
xist_df$new_group <- xist_df$group
for (proj in proj_ids) {
for (dis in dis_ids ) {
for (sex in c("F", "M")) {
if (any(grep(paste(proj, sex, dis, sep = "_"), xist_df[which(xist_df$group=="DISCO"), "info"]))) {
print(paste(proj, dis, sep = "_"))
xist_df[which(grepl(paste(proj, sex, dis, sep = "_"), xist_df$info)), "new_group"] <- paste(proj, dis, sep = "_")
}
}
}
}
xist_df$new_group <- str_replace_all(xist_df$new_group, "Normal", "Healthy")
groups_order <- c("Velmeshev_3rd_trimester","Velmeshev_0_1_years",
"Velmeshev_1_2_years", "Velmeshev_2_4_years",
"Velmeshev_10_20_years", "Velmeshev_Adults",
"GSE157827_Healthy","GSE174367_Healthy",
"PRJNA544731_Healthy", "GSE157827_Alzheimer's disease",
"GSE174367_Alzheimer's disease", "PRJNA544731_Multiple Sclerosis")
xist_df$new_group <- factor(xist_df$new_group, groups_order)
xist_df <- xist_df[order(xist_df$new_group), ]
xist_df$samples <- factor(xist_df$samples, unique(xist_df$samples))
xist_df <- xist_df[order(xist_df$samples), ]
pdf("Extra_figures/XIST_faceted.pdf", width = 15, height = 20)
print(
ggplot(xist_df, aes(samples, XIST, fill=sex)) +
geom_violin() +
facet_grid(new_group ~ sex, scales = "free", space = "free") +
labs(x="Samples", y="XIST RNA expression", fill="Sex") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
strip.text.y = element_text(size=8, face="bold", colour = "black", angle = 0),
strip.text.x = element_text(size=10, face="bold", colour = "black"),
plot.title = element_text(size=12, face="bold", colour = "black"),
axis.line = element_line(colour = "black"),
axis.title.x = element_blank(),
axis.text.x = element_text(size=8, colour = "black",angle = 90, vjust = 0.7, hjust=0.5),
axis.ticks.x=element_blank(),
axis.title.y = element_text(size=12, face="bold", colour = "black"),
legend.position = "bottom",
legend.title = element_text(size=12, face="bold", colour = "black"))
)
dev.off()
# 9. Number of samples
cell_info <- append(cell_info, list("Velmeshev_2nd_trimester"=read.csv("UCSC/DEGs_2nd_trimester/Velmeshev_2022_2nd_trimester/cell_info_Velmeshev_2022_2nd_trimester.csv")))
num_samples <- cell_info
num_samples$DISCO <- num_samples$DISCO[, c(1,2,5)]
num_samples <- do.call(rbind, num_samples)
num_samples <- cbind("group" = gsub('\\..*', '', rownames(num_samples)), num_samples)
rownames(num_samples) <- NULL
num_samples$X <- NULL
num_samples$new_ids <- num_samples$cell_id
num_samples$new_ids <- str_replace_all(num_samples$new_ids, c("-1_"="-1/", "-1--"="-1/"))
num_samples <- separate(num_samples, new_ids, into=c("barcodes", "samples"), sep="/")
num_samples$sex <- rep(NA, nrow(num_samples))
num_samples$og_group <- str_replace_all(num_samples$og_group, c("Female"="F", "Male"="M"))
num_samples[grep("M_", num_samples$og_group), "sex"] <- "M"
num_samples[grep("F_", num_samples$og_group), "sex"] <- "F"
disco <- as.data.frame(do.call(rbind, str_split(num_samples[which(num_samples$group=="DISCO"), "og_group"], "_")))
proj_ids <- unique(disco$V1)
dis_ids <- unique(disco$V3)
num_samples$new_group <- num_samples$group
for (proj in proj_ids) {
for (dis in dis_ids ) {
for (sex in c("F", "M")) {
if (any(grep(paste(proj, sex, dis, sep = "_"), num_samples[which(num_samples$group=="DISCO"), "og_group"]))) {
print(paste(proj, dis, sep = "_"))
num_samples[which(grepl(paste(proj, sex, dis, sep = "_"), num_samples$og_group)), "new_group"] <- paste(proj, dis, sep = "_")
}
}
}
}
num_samples$samples_count <- rep(NA, nrow(num_samples))
for (group in unique(num_samples$new_group)) {
for (sex in c("F", "M")) {
num_samples[which(num_samples$new_group==group & num_samples$sex==sex), "samples_count"] <- length(unique(num_samples[which(num_samples$new_group==group & num_samples$sex==sex), "samples"]))
}
}
num_samples_simplified <- num_samples[, c("new_group", "sex", "samples_count")]
num_samples_simplified <- num_samples_simplified[which(!duplicated(num_samples_simplified)),]
sum(num_samples_simplified$samples_count)
sum(num_samples_simplified[which(num_samples_simplified$sex=="F"), "samples_count"])
sum(num_samples_simplified[which(num_samples_simplified$sex=="M"), "samples_count"])
velm_4_10 <- read.csv("UCSC/outputs/Velmeshev_num_cells_per_age.csv")
velm_4_10 <- subset(velm_4_10, age=="4-10 years")
num_samples_simplified <- rbind(num_samples_simplified,
c("Velmeshev_4_10_years", "F", nrow(velm_4_10[which(velm_4_10$sex=="Female"),])),
c("Velmeshev_4_10_years", "M", nrow(velm_4_10[which(velm_4_10$sex=="Male"),])))
num_samples_simplified$samples_count <- as.numeric(num_samples_simplified$samples_count)
num_samples_simplified$new_group <- str_replace_all(num_samples_simplified$new_group, "Normal", "Healthy")
num_samples_simplified[which(grepl("^Velmeshev", num_samples_simplified$new_group)), "new_group"] <- paste(num_samples_simplified[which(grepl("^Velmeshev", num_samples_simplified$new_group)), "new_group"], "Healthy", sep = "_")
groups_order <- c("Velmeshev_2nd_trimester_Healthy", "Velmeshev_3rd_trimester_Healthy","Velmeshev_0_1_years_Healthy",
"Velmeshev_1_2_years_Healthy", "Velmeshev_2_4_years_Healthy", "Velmeshev_4_10_years_Healthy",
"Velmeshev_10_20_years_Healthy", "Velmeshev_Adults_Healthy",
"GSE157827_Healthy","GSE174367_Healthy",
"PRJNA544731_Healthy", "GSE157827_Alzheimer's disease",
"GSE174367_Alzheimer's disease", "PRJNA544731_Multiple Sclerosis")
num_samples_simplified$new_group <- factor(num_samples_simplified$new_group, groups_order)
num_samples_simplified <- num_samples_simplified[order(num_samples_simplified$new_group), ]
pdf("Extra_figures/num_samples.pdf")
print(
ggplot(num_samples_simplified, aes(new_group, samples_count, fill=sex)) +
geom_bar(stat = "identity", color="black", position = "dodge") +
geom_hline(yintercept = 3, linetype="dashed") +
labs(x="Groups", y="Number of samples", fill="Sex") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size=12, face="bold", colour = "black"),
axis.line = element_line(colour = "black"),
axis.text.x = element_text(size=14, colour = "black",angle = 90, vjust = 0.7, hjust=0.5),
axis.text.y = element_text(size=14, colour = "black"),
axis.title.y = element_text(size=16, face="bold", colour = "black"),
axis.title.x = element_text(size=16, face="bold", colour = "black"),
legend.position = "bottom",
legend.title = element_text(size=16, face="bold", colour = "black"))
)
dev.off()
png("Extra_figures/num_samples.png", res=300, width = 15, height = 10, units = "in")
print(
ggplot(num_samples_simplified, aes(new_group, samples_count, fill=sex)) +
geom_bar(stat = "identity", color="black", position = "dodge") +
geom_hline(yintercept = 3, linetype="dashed") +
labs(x="Datasets", y="Number of samples", fill="Sex") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size=12, face="bold", colour = "black"),
axis.line = element_line(colour = "black"),
axis.text.x = element_text(size=14, colour = "black",angle = 90, vjust = 0.7, hjust=0.5),
axis.text.y = element_text(size=14, colour = "black"),
axis.title.y = element_text(size=16, face="bold", colour = "black"),
axis.title.x = element_text(size=16, face="bold", colour = "black"),
legend.position = "bottom",
legend.title = element_text(size=16, face="bold", colour = "black"))
)
dev.off()