#!/usr/bin/env nextflow
// Version 0.3
nextflow.enable.dsl = 1
params.inputFile = ""
params.outputDir = ""
params.singularity_bindpath = ""
params.singularity_path = ""
params.output_runName = ""
params.skip_remap = false // if true, a common variant file must be provided in params.vcf
params.ref_fasta = ""
params.vcf = "" // leave blank to skip common variants (params.skip_remap must then be false)
params.threads = 8
// Range of k values used to run the SoupOrCell process in parallel, given as a list,
// e.g. ['2', '3']
params.k_range = []
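// Example launch, a minimal sketch (all file names and paths below are hypothetical):
//   nextflow run main.nf -params-file my_params.yml
// where my_params.yml might contain:
//   inputFile: samples.csv
//   outputDir: /path/to/results
//   output_runName: run01
//   ref_fasta: /path/to/genome.fa
//   singularity_bindpath: /path/to/data
//   singularity_path: /path/to/souporcell.sif
//   k_range: ['2', '3']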
kval_range = Channel.fromList(params.k_range).view()
// Make copy of k-value range for socSummary and runSoupOrCell
kval_range.into{kval_range_copy01; kval_range_copy02}
// Read in a .csv file that contains paths to sample directories and run names
run_info = Channel.fromPath(params.inputFile)
.splitCsv(header: true, sep: ',')
.map { row -> tuple( row.run_name, row.sample_path)}
.view()
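// Expected inputFile layout (values are illustrative):
//   run_name,sample_path
//   run01,/path/to/run01/sampleA
//   run01,/path/to/run01/sampleB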
// Make copy of sample paths for bam and barcode files
run_info.into{run_info_copy01; run_info_copy02}
/***
Process: Modify sample bam files to carry the tags SoupOrCell requires
Inputs: Sample .bam files
Published: None
Downstream: mergeBams
Notes:
***/
process processSample {
cache 'lenient'
memory '16 GB'
input:
tuple val(run_name), val(sample_path) from run_info_copy01
output:
file("*.bam") into processed_bams
script:
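// Assumption: reformat-sci-bam.py (a project-local helper) writes its output as
// "<prefix>.bam" given -o <prefix>; the *.bam output glob above relies on that.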
def sampleName = new File(sample_path).name
"""
module load python/3.12.1
reformat-sci-bam.py \
-i "${sample_path}" \
-r "${run_name}" \
-o "${run_name}_${sampleName}_socReady"
"""
}
/***
Process: Merge the modified bams from processSample into a single bam file.
Inputs: - *.bams - all modified bams, one per sample
Published: Merged, sorted bam and its index, in .bam and .bai format.
Downstream: runSoupOrCell
Notes: The merged bam is also copied to ${params.outputDir}/merged/ and reused for downstream soc analysis.
***/
process mergeBams {
cache 'lenient'
memory '64 GB'
publishDir path: "${params.outputDir}/merged/", pattern: "*sorted*", mode: 'copy'
input:
file input_bam from processed_bams.collect()
output:
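// "*sorted*" matches both the sorted .bam and its .bai index, emitted together as
// one list item; runSoupOrCell unpacks them positionally (the .bam sorts before the .bai).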
file("*sorted*") into bam_dir
script:
"""
# bash watch for errors
set -ueo pipefail
module load samtools/1.19
samtools merge -o "${params.output_runName}_merged.bam" ${input_bam}
samtools sort "${params.output_runName}_merged.bam" > "${params.output_runName}_merged_sorted.bam"
samtools index "${params.output_runName}_merged_sorted.bam"
"""
}
/***
Process: Get sample barcodes and filter for => 100 umis.
Append run name to barcode as found in modified bam tags.
Input: sample paths
Output: Filtered barcodes tsv with => 100 umis and run name appended to barcode
Downstream: runSoupOrCell
Published: None
Notes:
***/
process getBarcodes {
cache 'lenient'
memory '16 GB'
input:
tuple val(run_name), val(sample_path) from run_info_copy02
output:
file("*.tsv") into barcodes
script:
def sampleName = new File(sample_path).name
"""
# bash watch for errors
set -ueo pipefail
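# Assumption: umis_per_cell_barcode.txt holds one "<barcode> <umi_count>" pair per line,
# e.g. "AAACCTGAGAAACCAT 512" becomes "AAACCTGAGAAACCAT_run01" after filtering.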
awk -v run="${run_name}" '\$2 >= 100 {print \$1"_"run}' "${sample_path}/umis_per_cell_barcode.txt" \
> "${run_name}_${sampleName}_barcode.tsv"
"""
}
/***
Process: Merge the filtered sample barcodes.
Input: Filtered barcodes with >= 100 UMIs from each sample.
Output: Merged barcodes .tsv covering all samples.
Published: Merged barcodes in .tsv format
Notes:
***/
process mergeBarcodes {
cache 'lenient'
memory '16 GB'
publishDir path: "${params.outputDir}/merged/", pattern: "*.tsv", mode: 'copy'
input:
file input_files from barcodes.collect()
output:
file("*.tsv") into merged_barcodes
script:
"""
# bash watch for errors
set -ueo pipefail
cat ${input_files} > "${params.output_runName}_merged_barcodes.tsv"
"""
}
/***
Process: Run SoupOrCell on the merged bam file
Input: - .bam = Merged, sorted, and indexed bam file.
- .tsv = Merged and filtered barcodes
Output: - alt.mtx
- ambient_rna.txt
- clusters_genotypes.vcf
- clustering.done
- clusters_tmp.tsv
- clusters.tsv
- common_variants_covered_tmp.vcf
- common_variants_covered.vcf
- consensus.done
- depth_merged.bed
- doublets.err
- ref.mtx
- troublet.done
- variants.done
- vartrix.done
Published: All of the outputs above (the soc_output_k* directory is published by socSummary).
Notes:
***/
process runSoupOrCell {
cache 'lenient'
memory '16 GB'
cpus 12
penv 'serial'
input:
set file(input_bam), file(input_bai) from bam_dir
file input_barcode from merged_barcodes
val kval from kval_range_copy01
output:
file("soc_output_k*") into soc_output
val kval into kval_out
script:
// --skip_remap and --common_variants are passed only when applicable (see the
// params.vcf comment above); souporcell's --skip_remap flag takes a value.
def skip_remap_arg = params.skip_remap ? "--skip_remap True" : ""
def common_variants_arg = params.vcf ? "--common_variants ${params.vcf}" : ""
"""
# bash watch for errors
set -ueo pipefail
mkdir -p "soc_output_k${kval}"
singularity exec \
--bind ${params.singularity_bindpath} \
${params.singularity_path} \
souporcell_pipeline.py -i ${input_bam} -b ${input_barcode} -f ${params.ref_fasta} -t ${params.threads} -o soc_output_k${kval} -k ${kval} \
${skip_remap_arg} \
${common_variants_arg}
"""
}
/**
Process: Summarize SoupOrCell results: the number of droplets classified as doublets, ambiguous, or assigned to each cluster.
Input: - clusters.tsv
Output: souporcell_summary.tsv
Published: The full soc_output_k* directory, including souporcell_summary.tsv
Notes:
**/
process socSummary {
cache 'lenient'
memory '32 GB'
publishDir path: "${params.outputDir}/", pattern: "soc_output_k*", mode: 'copy'
input:
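// kval is unused in the script below; it is consumed only so this process runs once
// per k value. Note that pairing a kval with its matching soc_output_k* directory is
// not guaranteed, since the two channels are combined by emission order.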
val kval from kval_range_copy02
file soc_dir from soc_output
output:
file soc_dir into soc_summary
script:
"""
# bash watch for errors
set -ueo pipefail
souporcell_summary.sh ${soc_dir}/clusters.tsv > ${soc_dir}/souporcell_summary.tsv
"""
}
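/*
 * Optional: a minimal nextflow.config sketch for running this pipeline on an SGE
 * cluster, where the penv directive above applies (illustrative only; the executor
 * and queue depend on your environment):
 *
 * process {
 *     executor = 'sge'
 * }
 */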