scRNA_seq_pipeline.sh (forked from JunyueC/sci-RNA-seq3_pipeline)
#!/bin/bash
# original scripts from Jun, tweaked by Beth
# updated for ubuntu and python3 3/12/2024 by Beth
# Define the location of the script path
script_path="/net/shendure/vol1/home/martin91/scripts/sciRNAseq3"
################################################################################
###Experiment-specific settings
# define the fastq folder including all fastq files
fastq_folder="<your_project_folder>/nobackup/fastq"
# where all the output goes; make sure not to put a slash at the end of this path:
all_output_folder="<your_project_folder>/nobackup/output"
# define the PCR group IDs after demultiplexing - a text file listing the PCR wells from the demux sample sheet, one per line
pcrwell="<your_project_folder>/pcrwells.txt"
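# a hypothetical example of pcrwells.txt (one PCR well ID per line, matching the names used in the demux sample sheet;
# your well IDs will differ):
#   PCR_A01
#   PCR_A02
#   PCR_B01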
#RT samplesheet - a samplesheet with the headers: RTwell, RTindex, SampleName. Defines which samples went into each well of the RT plate.
RT_sample="<your_project_folder>/RTsamplesheet.csv"
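# a hypothetical example of RTsamplesheet.csv (header row plus one line per RT well used; the indices and names below are made up):
#   RTwell,RTindex,SampleName
#   A01,ACGTACGTAC,sampleA
#   A02,TGCATGCATG,sampleA
#   B01,CAGTCAGTCA,sampleB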
# the reference you are mapping to: a genome index generated by STAR, plus a matching gzipped gtf file
# change this depending on sample:
# define the location of the index files used in STAR - remember which organism you're working with here
# drosophila: /net/shendure/vol12/projects/sciRNAseq_script/index/STAR_drosophila_BDGP6/
# mouse: /net/shendure/vol10/projects/scRNA/nobackup/reference/index/STAR/STAR_mm10_RNAseq/ #deleted
# newer mouse: /net/shendure/vol10/nobackup/genome/STAR/GRCm38-p6-all
index="/net/shendure/vol10/nobackup/genome/STAR/GRCm38-p6-all"
# define the gtf file for gene counting
# drosophila: /net/shendure/vol12/projects/sciRNAseq_script/gtf_file/Drosophila_melanogaster.BDGP6.87.gtf.gz
# mouse: /net/shendure/vol1/home/martin91/nobackup/reference/mouse/gencode.vM22.chr_patch_hapl_scaff.annotation.gtf.gz
# newer mouse: /net/shendure/vol10/nobackup/genome/GTF/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz
gtf_file="/net/shendure/vol10/nobackup/genome/GTF/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz"
# the counting script differs by organism because of the structure of the gtf file (the fly gtf is formatted differently)
# for mouse/human: countscript=$script_path/sciRNAseq_count.py
# for fly: countscript=$script_path/sciRNAseq_count_dro.py
countscript=$script_path/sciRNAseq_count.py
################################################################################
###Common settings
# define the core number for parallel processing
core=1
core_sam=3
# define the unique-read cutoff used when splitting out single cells
cutoff=200 # 200 is the usual value; use 100 if there are too many cells on a sequencing run
# define the number of mismatches allowed when removing duplicates:
mismatch=1
# define the location of the ligation barcodes
ligation_barcode=$script_path/lig_384_bc.pickle2
#ligation wells (all 384)
ligwells=$script_path/ligationwellbarcode384.csv
# define the location of the RT barcodes
RT_barcode=$script_path/RT_384_bc.pickle2
# define the location of the combined RT and ligation barcodes
barcodes=$script_path/combined_384_wells.txt
# define the location of the R script for multi-core processing
R_script=$script_path/sci3_bash_input_ID_output_core.R
# define the python bin directory if you want to use your own python install
# you will have to call this path in the other scripts that use python
#python_path="/net/shendure/vol12/projects/sciRNAseq_script/anaconda2/bin/"
# if this is not set explicitly, the pipeline just uses the cluster python (python/3.12.1) loaded in the modules below
############# make sure these modules are available; they are all on the gs ubuntu clusters now
#for ubuntu
module load modules modules-init modules-gs
module load samtools/1.19
module load bedtools/2.31.1
module load STAR/2.6.1d
module load R/4.3.2
module load python/3.12.1
module load cutadapt/4.6
module load fastqc/0.12.1
module load trim_galore/0.6.10
module load numpy/1.26.4
module load matplotlib/3.8.3
module load pandas/2.1.4
module load levenshtein/0.25.0
module load HTSeq/2.0.5
########### UMI attach
## this script takes an input folder, a PCR well list, an output folder, a ligation barcode file, a ligation well file,
## an RT (oligo-dT) barcode file, an RT sample sheet, and a core count, and passes them to the python script
input_folder=$fastq_folder
output_folder=$all_output_folder/UMI_attach
#script=$script_path/UMI_barcode_attach_gzipped_with_dic_new_2RT.py #this is new for 2 RT primers
script=$script_path/UMI_barcode_attach_gzipped_with_dic_new.py # this is new for just 1 RT Primer
now=$(date)
echo "Current time : $now"
echo "changing the name of the fastq files..."
for sample in $(cat $pcrwell); do
    echo changing name $sample
    mv $input_folder/*$sample*R1*.gz $input_folder/$sample.R1.fastq.gz
    mv $input_folder/*$sample*R2*.gz $input_folder/$sample.R2.fastq.gz
done
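# note: the renaming above assumes exactly one R1 and one R2 fastq.gz per PCR well in $input_folder;
# if a well matches several files, the mv will fail, so check the demux output first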
echo "Attaching barcode and UMI...."
mkdir -p $output_folder
#$python_path/python $script $input_folder $pcrwell $output_folder $ligation_barcode $ligwells $RT_barcode $RT_sample $core
python $script $input_folder $pcrwell $output_folder $ligation_barcode $ligwells $RT_barcode $RT_sample $core
echo "Barcode transformed and UMI attached."
####make a samplelist file from your RT samplesheet
awk 'NR>1' $RT_sample | awk -F ',' '{print $3}' | sort -u > $all_output_folder/samplelist.txt
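# (the awk pipeline above skips the header row, pulls the SampleName column, and keeps unique names)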
samplelist=$all_output_folder/samplelist.txt
################# Trimming the read2
trimmed_fastq=$all_output_folder/trimmed_fastq
UMI_attached_R2=$all_output_folder/UMI_attach
echo
echo "Start trimming the read2 file..."
echo $(date)
Rscript $R_script $script_path/sci3_trim.sh $UMI_attached_R2 $pcrwell $trimmed_fastq $core
############# align the reads with STAR, filter the reads based on q > 30, and remove duplicates based on exact UMI sequence and tagmentation site
trimmed_fastq=$all_output_folder/trimmed_fastq
input_folder=$trimmed_fastq
STAR_output_folder=$all_output_folder/STAR_alignment
filtered_sam_folder=$all_output_folder/filtered_sam
rmdup_sam_folder=$all_output_folder/rmdup_sam
#align read2 to the index file using STAR with default setting # you will need to make sure your session has enough memory (40G): qlogin -l mfree=4G -pe serial 10
echo "Start alignment using STAR..."
echo input folder: $input_folder
echo RTsamplesheet: $RT_sample
echo index file: $index
echo output_folder: $STAR_output_folder
#make the output folder
mkdir -p $STAR_output_folder
# remove any genome index left in shared memory from a previous run, so the load below starts clean
STAR --genomeDir $index --genomeLoad Remove
#start the alignment
for sample in $(cat $pcrwell); do
    echo Aligning $sample
    STAR --runThreadN $core --outSAMstrandField intronMotif --genomeDir $index \
        --readFilesCommand zcat --readFilesIn $input_folder/$sample*gz \
        --outFileNamePrefix $STAR_output_folder/$sample --genomeLoad LoadAndKeep --outReadsUnmapped Fastx
done
#remove the index from the memory
STAR --genomeDir $index --genomeLoad Remove
echo "All alignment done."
# make the filtered sam folder, then filter and sort each sam file
echo
echo "Start filtering and sorting the sam files..."
echo input folder: $STAR_output_folder
echo output folder: $filtered_sam_folder
bash_script=$script_path/sci3_filter.sh
Rscript $R_script $bash_script $STAR_output_folder $pcrwell $filtered_sam_folder $core_sam
# make the rmdup_sam folder,
# then for each filtered sam file, remove duplicates based on UMI, barcode, chromosome and position
echo
echo "Start removing duplicates..."
echo input folder: $filtered_sam_folder
echo output folder: $rmdup_sam_folder
mkdir -p $rmdup_sam_folder
bash_script=$script_path/sci3_rmdup_nomismatch.sh # for removing duplicates only considering exact match
Rscript $R_script $bash_script $filtered_sam_folder $pcrwell $rmdup_sam_folder $core $mismatch
# move the duplicate-removal report files to the report/duplicate_read/ folder
mkdir -p $input_folder/../report/duplicate_read
mv $rmdup_sam_folder/*.csv $input_folder/../report/duplicate_read/
echo "removing duplicates completed.."
echo
echo "Alignment and sam file preprocessing are done."
# repeat the rmdup process to remove duplicates based on UMI distance (allowing up to $mismatch mismatches)
echo
echo "Start removing duplicates..."
echo input folder: $all_output_folder/rmdup_sam
echo output folder: $all_output_folder/rmdup_sam_2
mkdir -p $all_output_folder/rmdup_sam_2
bash_script=$script_path/sci3_rmdup.sh
filtered_sam_folder=$all_output_folder/rmdup_sam
rmdup_sam_folder=$all_output_folder/rmdup_sam_2
Rscript $R_script $bash_script $filtered_sam_folder $pcrwell $rmdup_sam_folder $core $mismatch
################# split the sam file based on the barcode, and mv the result to the report folder
sam_folder=$all_output_folder/rmdup_sam_2
output_folder=$all_output_folder/sam_splitted
echo
echo "Start splitting the sam file..."
echo samfile folder: $sam_folder
echo pcrwell list: $pcrwell
echo output folder: $output_folder
echo barcode file: $barcodes
echo cutoff value: $cutoff
bash_script=$script_path/sci3_split.sh
Rscript $R_script $bash_script $sam_folder $pcrwell $output_folder $core $barcodes $cutoff
#this is making a list of all the barcodes in the data
cat $output_folder/*sample_list.txt > $output_folder/All_samples.txt
cp $output_folder/All_samples.txt $output_folder/../barcode_samples.txt
# move the reports to the report/barcode_read_distribution folder
mkdir -p $output_folder/../report/barcode_read_distribution
mv $output_folder/*.txt $output_folder/../report/barcode_read_distribution/
mv $output_folder/*.png $output_folder/../report/barcode_read_distribution/
echo
echo "All sam file splitted."
################# gene count
# count reads mapping to genes
output_folder=$all_output_folder/report/gene_count/
input_folder=$all_output_folder/sam_splitted
cell_ID=$all_output_folder/barcode_samples.txt
echo "Start the gene count...."
python $countscript $gtf_file $input_folder $cell_ID $core
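# the count script writes *.count, *.report and *_annotate.txt files into $input_folder;
# these are gathered into $output_folder below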
echo "Make the output folder and transfer the files..."
mkdir -p $output_folder
find $input_folder -name '*.count' -exec cat {} + > $output_folder/count.MM
find $input_folder -name '*.count' | xargs rm -f # comment this out if you want to see what's in it
find $input_folder -name '*.report' -exec cat {} + > $output_folder/report.MM
find $input_folder -name '*.report' | xargs rm -f
mv $input_folder/*_annotate.txt $output_folder/
echo "All output files are transferred~"
now=$(date)
echo "Current time : $now"
echo "load R/4.3.2 and then run the genecount processing script with Rscript gene_count_processing_sciRNAseq_CX.R"