-
Notifications
You must be signed in to change notification settings - Fork 0
/
Step04_MapDatasets.R
181 lines (125 loc) · 5.98 KB
/
Step04_MapDatasets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# This script maps single cell datasets onto a reference dataset (Cain 2020), in
# order to get consistent broad and fine cell-type assignments across all the
# data sets. We use the broad_class and sub_class definitions from the Cain
# data set.
# Note: This script requires > 64 GB of RAM for some mappings.
library(Seurat)
library(SingleCellExperiment)
library(stringr)
library(dplyr)
source(file.path("functions", "FileIO_HelperFunctions.R"))
source(file.path("functions", "Step04_Mapping_HelperFunctions.R"))
query_datasets <- c("lau", "leng", "mathys", "seaRef")
reference_dataset <- "cain"
ref_seurat <- Load_MapReference(reference_dataset, "broad_class")
ref_seurat_subclass <- Load_MapReference(reference_dataset, "sub_class")
# Fix labels in the Cain pre-processed data ------------------------------------
# Endothelial cells, pericytes, and VLMCs were grouped under "Vascular" in
# the integrated reference. This just transfers that change onto the original
# pre-processed data and saves the object in the post-processed folder.
sce <- Load_PreprocessedData(reference_dataset, remove_excluded = TRUE)
# The cells should already be in the same order, but just in case
cell_order <- match(sce$cell_id, ref_seurat$cell_id)
sce$broad_class <- ref_seurat$broad_class[cell_order]
sce$broad_class <- factor(sce$broad_class)
subclass_meta <- lapply(ref_seurat_subclass, function(S) {
return(select([email protected], cell_id, broad_class, sub_class))
})
subclass_meta <- do.call(rbind, subclass_meta)
cell_order <- match(sce$cell_id, subclass_meta$cell_id)
sce$sub_class <- as.character(subclass_meta$sub_class[cell_order])
sce$sub_class <- factor(sce$sub_class)
Save_SingleCell(reference_dataset, sce)
rm(sce)
gc()
# Map the other data sets to the reference -------------------------------------
for (query_dataset in query_datasets) {
message(str_glue("Mapping dataset {query_dataset}..."))
query_sce <- Load_PreprocessedData(query_dataset, remove_excluded = TRUE)
# Set up one seurat object per sample
query_seurat <- CreateSeuratObject(counts(query_sce),
meta.data = data.frame(colData(query_sce)))
query_seurat <- SplitObject(query_seurat, split.by = "sample")
rm(query_sce)
gc()
## Map each individual sample to reference -----------------------------------
query_seurat <- lapply(query_seurat, function(query) {
samp_name <- unique(query$sample)
print(str_glue("{query_dataset} sample {samp_name}..."))
Map_Cells(query, ref_seurat,
dims_use = 1:30,
map_cols = list(broad_class = "broad_class"),
recompute_residuals = TRUE)
})
query_mapped <- Merge_Queries(query_seurat)
# Save mapping info for later inspection. We don't need to save the entire
# Seurat object, just the parts that were mapped.
to_save <- list(metadata = [email protected],
reductions = query_mapped@reductions)
saveRDS(to_save,
file.path(dir_tmp, str_glue("mapped_{query_dataset}_broad_class.rds")))
rm(query_seurat, query_mapped)
gc()
## Map sub classes -----------------------------------------------------------
# Uses all samples together due to low cell counts per sample, rather than
# splitting by sample
query_sce <- Load_PreprocessedData(query_dataset, remove_excluded = TRUE)
cell_order <- match(query_sce$cell_id, to_save$metadata$cell_id)
metadata <- to_save$metadata[cell_order, ]
query_seurat <- CreateSeuratObject(counts(query_sce),
meta.data = metadata)
rm(query_sce)
gc()
query_seurat <- SplitObject(query_seurat, split.by = "predicted.broad_class")
query_seurat <- lapply(names(query_seurat), function(X) {
Map_Cells(query_seurat[[X]], ref_seurat_subclass[[X]],
dims_use = 1:30,
map_cols = list(sub_class = "sub_class"),
recompute_residuals = TRUE)
})
query_mapped <- Merge_Queries(query_seurat)
to_save <- list(metadata = [email protected],
reductions = query_mapped@reductions)
saveRDS(to_save,
file.path(dir_tmp, str_glue("mapped_{query_dataset}_sub_class.rds")))
rm(query_seurat, query_mapped)
gc()
## Save the final result -----------------------------------------------------
query_sce <- Load_PreprocessedData(query_dataset, remove_excluded = TRUE)
cell_order <- match(query_sce$cell_id, to_save$metadata$cell_id)
metadata <- to_save$metadata[cell_order, ]
query_sce$original_broad_class <- query_sce$broad_class
query_sce$original_sub_class <- query_sce$sub_class
query_sce$broad_class <- factor(metadata$predicted.broad_class)
query_sce$sub_class <- factor(metadata$predicted.sub_class)
Save_SingleCell(query_dataset, query_sce)
rm(query_sce)
gc()
}
# Notes on initial sub_class mapping using all subclasses present in cain:
# lau has all subtypes
# lau has low (<20): Ast.5**, End.3**, End.5, End.6*
# leng has all subtypes
# leng has low (<20): Ast.5, End.3**, End.6
# mathys has no: End.3, End.6
# mathys has low (<20): Ast.4, End.2*, End.4, End.5**, Mic.5
# seaRef has no: Ast.4, Ast.5, End.2, End.3, End.6, Mic.3, Mic.5
# seaRef has low (<20): Ast.2**, End.4*, End.5**, Mic.4, Mic.5**, Oligo.4**
# Low or missing in all data sets: End.3, End.6
# Low or missing in 3 of 4 datasets: Ast.5, End.5
# Low or missing in 2 of 4 datasets: Ast.4, End.2, End.4, Mic.5
# Low or missing in seaRef only: Mic.4, Oligo.4
# After these results, I decided that the endothelial subclusters should
# be combined since most endothelial clusters were missing from at least two
# datasets.
# Mapping after combination of endothelial clusters:
# lau has all subtypes
# lau has low (<20): Ast.5**
# leng has all subtypes
# leng has low (<20): Ast.5
# mathys has all subtypes
# mathys has low (<20): Ast.4, Mic.5*
# seaRef is missing: Ast.4, Ast.5, Mic.3, Mic.5
# seaRef has low (<20): Ast.2**, Mic.4*, Oligo.4**
# These results led me to remove glial subtypes as well, resulting in only
# Excitatory, Inhibitory, and Vascular cells having subtypes.