-
Notifications
You must be signed in to change notification settings - Fork 0
/
Step06_FindMarkers.R
149 lines (107 loc) · 4.85 KB
/
Step06_FindMarkers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Finds cell type markers in single cell data using 4 different algorithms:
# Dtangle, Seurat/MAST, AutogeneS, and DESeq2
library(dplyr)
library(doParallel)
library(foreach)
library(stringr)
library(tidyr) # expand_grid(), used in the correlation-ordering section
source(file.path("functions", "FileIO_HelperFunctions.R"))
source(file.path("functions", "General_HelperFunctions.R"))
# Settings ---------------------------------------------------------------------
# Which algorithms to run for marker finding. Each entry enables the matching
# section below; "excluded_genes" finds diagnosis-influenced genes that later
# get removed from each algorithm's marker lists.
marker_types_run <- c("dtangle", "seurat", "autogenes", "deseq2", "excluded_genes")
# Run multiple parameter sets in parallel for dtangle/HSPE marker finding if
# dtangle_do_parallel is TRUE. Other algorithms can't be run in parallel.
# Assume most data sets use <20 GB of RAM per core, but seaRef will use > 100 GB
# for single cell.
dtangle_do_parallel <- TRUE
dtangle_n_cores <- 4
dtangle_clust_type <- "FORK" # Use PSOCK for non-Unix systems
# Which datasets to run on (single-cell reference data sets)
datasets <- c("cain", "lau", "leng", "mathys", "seaRef")
# What granularities? (cell-type resolution levels used when finding markers)
granularities <- c("broad_class", "sub_class")
# Dtangle/HSPE only: input types (run on single-cell counts and/or pseudobulk)
dtangle_input_types <- c("singlecell", "pseudobulk")
# Dtangle/HSPE markers ---------------------------------------------------------
if ("dtangle" %in% marker_types_run) {
  if (dtangle_do_parallel) {
    cl <- makeCluster(dtangle_n_cores, type = dtangle_clust_type, outfile = "")
    registerDoParallel(cl)
  }
  source(file.path("functions", "Step06a_FindMarkers_DtangleHSPE.R"))
  # Wrap the call so the worker cluster is always released, even if marker
  # finding errors out part-way through (previously an error here would leak
  # the cluster because stopCluster() was never reached).
  tryCatch(
    FindMarkers_DtangleHSPE(datasets, granularities, dtangle_input_types),
    finally = {
      if (dtangle_do_parallel) {
        stopCluster(cl)
      }
    }
  )
}
# Remaining marker algorithms --------------------------------------------------
# Each algorithm pairs the helper script that defines it with the entry-point
# function that script provides. All of these run serially:
#   * Seurat / MAST already uses all available CPUs internally.
#   * AutogeneS uses reticulate, so it isn't run in parallel in case that
#     causes issues with python environments.
#   * DESeq2 finds markers from pseudobulk.
#   * "excluded_genes" finds marker genes that change with diagnosis.
serial_marker_steps <- list(
  seurat = list(script = "Step06b_FindMarkers_Seurat.R",
                entry = "FindMarkers_Seurat"),
  autogenes = list(script = "Step06c_FindMarkers_AutogeneS.R",
                   entry = "FindMarkers_AutogeneS"),
  deseq2 = list(script = "Step06d_FindMarkers_DESeq2.R",
                entry = "FindMarkers_DESeq2"),
  excluded_genes = list(script = "Step06e_FindMarkerExclusions.R",
                        entry = "FindMarkerExclusions")
)
for (step_name in names(serial_marker_steps)) {
  if (step_name %in% marker_types_run) {
    step <- serial_marker_steps[[step_name]]
    source(file.path("functions", step$script))
    do.call(step$entry, list(datasets, granularities))
  }
}
# Filter markers down to exclude diagnosis-influenced genes --------------------
# For each dataset, read its diagnosis-based exclusion list (if one exists),
# remove those genes from every marker file's "filtered" lists, and save the
# result back as "filtered_for_dx".
for (dataset in datasets) {
  exclusions_file <- file.path(dir_metadata,
                               str_glue("{dataset}_excluded_genes.rds"))
  if (file.exists(exclusions_file)) {
    exclusions <- readRDS(exclusions_file)
    exclusions <- exclusions$genes
  } else { # seaRef won't have exclusions based on diagnosis
    exclusions <- c()
  }
  # Anchor the regex at the start of the file name so one dataset's name
  # appearing inside another file's name can't pull in the wrong files.
  marker_files <- list.files(path = dir_markers,
                             pattern = str_glue("^{dataset}"),
                             full.names = TRUE)
  for (file in marker_files) {
    markers <- readRDS(file)
    markers$filtered_for_dx <- lapply(markers$filtered, setdiff, exclusions)
    saveRDS(markers, file)
    # Report what fraction of markers were dropped. Guard against division
    # by zero when a file has no filtered markers (original printed "NaN%").
    n_before <- sum(lengths(markers$filtered))
    n_after <- sum(lengths(markers$filtered_for_dx))
    loss <- if (n_before > 0) (1 - n_after / n_before) * 100 else 0
    print(str_glue("{basename(file)}: removed {round(loss)}% of total markers"))
  }
}
# Calculate ordering by correlation --------------------------------------------
# For every combination of reference data set, bulk data set, normalization,
# regression, and granularity, order each marker list by correlation with the
# bulk expression data and store the result in the markers file under
# "ordered_by_correlation", keyed by the parameter combination.
# NOTE: expand_grid() comes from tidyr (loaded at the top of this script).
params_loop <- expand_grid(test_data_name = c("Mayo", "MSBB", "ROSMAP"),
                           normalization = c("cpm", "tmm", "tpm"),
                           regression_method = c("none", "edger", "lme", "dream"))
for (granularity in granularities) {
  for (dataset in datasets) {
    marker_files <- list.files(path = dir_markers,
                               pattern = str_glue("{dataset}_{granularity}"),
                               full.names = TRUE)
    for (file in marker_files) {
      markers <- readRDS(file)
      markers$ordered_by_correlation <- list()
      # seq_len() is safe if params_loop ever has zero rows, unlike
      # 1:nrow() which would iterate over c(1, 0).
      for (row in seq_len(nrow(params_loop))) {
        params <- params_loop[row, ]
        data <- Load_BulkData(params$test_data_name, params$normalization,
                              params$regression_method)
        markers_ordered <- OrderMarkers_ByCorrelation(markers$filtered,
                                                      as.matrix(assay(data, "counts")))
        markers$ordered_by_correlation[[paste(params, collapse = "_")]] <- markers_ordered
      }
      print(file)
      saveRDS(markers, file)
    }
  }
}