scRNA_seq_pipeline.sh (forked from JunyueC/sci-RNA-seq3_pipeline)
#!/bin/bash
# original scripts from Jun, tweaked by Beth
# updated for ubuntu and python3 3/12/2024 by Beth
# Define the location of the script path
script_path="/net/shendure/vol1/home/martin91/scripts/sciRNAseq3"
################################################################################
###Experiment-specific settings
# define the fastq folder including all fastq files
fastq_folder="<your_project_folder>/nobackup/fastq"
# where all the output goes; make sure not to put a slash at the end of this path:
all_output_folder="<your_project_folder>/nobackup/output"
# define the PCR group IDs after demultiplexing - a text file listing the PCR wells from the demux sample sheet, one per line
pcrwell="<your_project_folder>/pcrwells.txt"
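# a hypothetical example of pcrwells.txt (one PCR well ID per line, matching the names used in the demux sample sheet;
# your well IDs will differ):
#   PCR_A01
#   PCR_A02
#   PCR_B01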
#RT samplesheet - a samplesheet with the headers: RTwell, RTindex, SampleName. Defines which samples went into each well of the RT plate.
RT_sample="<your_project_folder>/RTsamplesheet.csv"
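# a hypothetical example of RTsamplesheet.csv (header row plus one line per RT well used; the indices and names below are made up):
#   RTwell,RTindex,SampleName
#   A01,ACGTACGTAC,sampleA
#   A02,TGCATGCATG,sampleA
#   B01,CAGTCAGTCA,sampleB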
# the reference you are mapping to: a genome index generated by STAR, plus a matching gzipped gtf file
# change this depending on sample:
# define the location of the index files used in STAR - remember which organism you're working with here
# drosophila: /net/shendure/vol12/projects/sciRNAseq_script/index/STAR_drosophila_BDGP6/
# mouse: /net/shendure/vol10/projects/scRNA/nobackup/reference/index/STAR/STAR_mm10_RNAseq/ #deleted
# newer mouse: /net/shendure/vol10/nobackup/genome/STAR/GRCm38-p6-all
index="/net/shendure/vol10/nobackup/genome/STAR/GRCm38-p6-all"
# define the gtf file for gene counting
# drosophila: /net/shendure/vol12/projects/sciRNAseq_script/gtf_file/Drosophila_melanogaster.BDGP6.87.gtf.gz
# mouse: /net/shendure/vol1/home/martin91/nobackup/reference/mouse/gencode.vM22.chr_patch_hapl_scaff.annotation.gtf.gz
# newer mouse: /net/shendure/vol10/nobackup/genome/GTF/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz
gtf_file="/net/shendure/vol10/nobackup/genome/GTF/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz"
# the counting script differs by organism because of the structure of the gtf file (the fly gtf is formatted differently)
# for mouse/human: countscript=$script_path/sciRNAseq_count.py
# for fly: countscript=$script_path/sciRNAseq_count_dro.py
countscript=$script_path/sciRNAseq_count.py
################################################################################
###Common settings
# define the core number for parallel processing
core=1
core_sam=3
# define the unique-read cutoff used when splitting out single cells
cutoff=200 # 200 is the usual value; use 100 if there are too many cells on a sequencing run
# define the number of mismatches allowed when removing duplicates:
mismatch=1
# define the location of the ligation barcodes
ligation_barcode=$script_path/lig_384_bc.pickle2
#ligation wells (all 384)
ligwells=$script_path/ligationwellbarcode384.csv
# define the location of the RT barcodes
RT_barcode=$script_path/RT_384_bc.pickle2
# define the location of the combined RT and ligation barcodes
barcodes=$script_path/combined_384_wells.txt
# define the location of the R script for multi-core processing
R_script=$script_path/sci3_bash_input_ID_output_core.R
# define the python bin directory if you want to use your own python install
# you will have to call this path in the other scripts that use python
#python_path="/net/shendure/vol12/projects/sciRNAseq_script/anaconda2/bin/"
# if this is not set explicitly, the pipeline just uses the cluster python (python/3.12.1) loaded in the modules below
############# make sure these modules are available; they are all on the gs ubuntu clusters now
#for ubuntu
module load modules modules-init modules-gs
module load samtools/1.19
module load bedtools/2.31.1
module load STAR/2.6.1d
module load R/4.3.2
module load python/3.12.1
module load cutadapt/4.6
module load fastqc/0.12.1
module load trim_galore/0.6.10
module load numpy/1.26.4
module load matplotlib/3.8.3
module load pandas/2.1.4
module load levenshtein/0.25.0
module load HTSeq/2.0.5
########### UMI attach
## this script takes an input folder, a PCR well list, an output folder, a ligation barcode file, a ligation well file,
## an RT (oligo-dT) barcode file, an RT sample sheet, and a core count, and passes them to the python script
input_folder=$fastq_folder
output_folder=$all_output_folder/UMI_attach
#script=$script_path/UMI_barcode_attach_gzipped_with_dic_new_2RT.py #this is new for 2 RT primers
script=$script_path/UMI_barcode_attach_gzipped_with_dic_new.py # this is new for just 1 RT Primer
now=$(date)
echo "Current time : $now"
echo "changing the name of the fastq files..."
for sample in $(cat $pcrwell); do
    echo changing name $sample
    mv $input_folder/*$sample*R1*.gz $input_folder/$sample.R1.fastq.gz
    mv $input_folder/*$sample*R2*.gz $input_folder/$sample.R2.fastq.gz
done
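# note: the renaming above assumes exactly one R1 and one R2 fastq.gz per PCR well in $input_folder;
# if a well matches several files, the mv will fail, so check the demux output first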
echo "Attaching barcode and UMI...."
mkdir -p $output_folder
#$python_path/python $script $input_folder $pcrwell $output_folder $ligation_barcode $ligwells $RT_barcode $RT_sample $core
python $script $input_folder $pcrwell $output_folder $ligation_barcode $ligwells $RT_barcode $RT_sample $core
echo "Barcode transformed and UMI attached."
####make a samplelist file from your RT samplesheet
awk 'NR>1' $RT_sample | awk -F ',' '{print $3}' | sort -u > $all_output_folder/samplelist.txt
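# (the awk pipeline above skips the header row, pulls the SampleName column, and keeps unique names)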
samplelist=$all_output_folder/samplelist.txt
################# Trimming the read2
trimmed_fastq=$all_output_folder/trimmed_fastq
UMI_attached_R2=$all_output_folder/UMI_attach
echo
echo "Start trimming the read2 file..."
echo $(date)
Rscript $R_script $script_path/sci3_trim.sh $UMI_attached_R2 $pcrwell $trimmed_fastq $core
############# align the reads with STAR, filter the reads based on q > 30, and remove duplicates based on exact UMI sequence and tagmentation site
trimmed_fastq=$all_output_folder/trimmed_fastq
input_folder=$trimmed_fastq
STAR_output_folder=$all_output_folder/STAR_alignment
filtered_sam_folder=$all_output_folder/filtered_sam
rmdup_sam_folder=$all_output_folder/rmdup_sam
#align read2 to the index file using STAR with default setting # you will need to make sure your session has enough memory (40G): qlogin -l mfree=4G -pe serial 10
echo "Start alignment using STAR..."
echo input folder: $input_folder
echo RTsamplesheet: $RT_sample
echo index file: $index
echo output_folder: $STAR_output_folder
#make the output folder
mkdir -p $STAR_output_folder
# remove any genome index left in shared memory from a previous run, so the load below starts clean
STAR --genomeDir $index --genomeLoad Remove
#start the alignment
for sample in $(cat $pcrwell); do
    echo Aligning $sample
    STAR --runThreadN $core --outSAMstrandField intronMotif --genomeDir $index \
        --readFilesCommand zcat --readFilesIn $input_folder/$sample*gz \
        --outFileNamePrefix $STAR_output_folder/$sample --genomeLoad LoadAndKeep --outReadsUnmapped Fastx
done
#remove the index from the memory
STAR --genomeDir $index --genomeLoad Remove
echo "All alignment done."
# make the filtered sam folder, then filter and sort each sam file
echo
echo "Start filtering and sorting the sam files..."
echo input folder: $STAR_output_folder
echo output folder: $filtered_sam_folder
bash_script=$script_path/sci3_filter.sh
Rscript $R_script $bash_script $STAR_output_folder $pcrwell $filtered_sam_folder $core_sam
# make the rmdup_sam folder,
# then for each filtered sam file, remove duplicates based on UMI, barcode, chromosome and position
echo
echo "Start removing duplicates..."
echo input folder: $filtered_sam_folder
echo output folder: $rmdup_sam_folder
mkdir -p $rmdup_sam_folder
bash_script=$script_path/sci3_rmdup_nomismatch.sh # for removing duplicates only considering exact match
Rscript $R_script $bash_script $filtered_sam_folder $pcrwell $rmdup_sam_folder $core $mismatch
# move the duplicate-removal report files to the report/duplicate_read/ folder
mkdir -p $input_folder/../report/duplicate_read
mv $rmdup_sam_folder/*.csv $input_folder/../report/duplicate_read/
echo "removing duplicates completed.."
echo
echo "Alignment and sam file preprocessing are done."
# repeat the rmdup process to remove duplicates based on UMI distance (allowing up to $mismatch mismatches)
echo
echo "Start removing duplicates..."
echo input folder: $all_output_folder/rmdup_sam
echo output folder: $all_output_folder/rmdup_sam_2
mkdir -p $all_output_folder/rmdup_sam_2
bash_script=$script_path/sci3_rmdup.sh
filtered_sam_folder=$all_output_folder/rmdup_sam
rmdup_sam_folder=$all_output_folder/rmdup_sam_2
Rscript $R_script $bash_script $filtered_sam_folder $pcrwell $rmdup_sam_folder $core $mismatch
################# split the sam file based on the barcode, and mv the result to the report folder
sam_folder=$all_output_folder/rmdup_sam_2
output_folder=$all_output_folder/sam_splitted
echo
echo "Start splitting the sam file..."
echo samfile folder: $sam_folder
echo pcrwell list: $pcrwell
echo output folder: $output_folder
echo barcode file: $barcodes
echo cutoff value: $cutoff
bash_script=$script_path/sci3_split.sh
Rscript $R_script $bash_script $sam_folder $pcrwell $output_folder $core $barcodes $cutoff
#this is making a list of all the barcodes in the data
cat $output_folder/*sample_list.txt > $output_folder/All_samples.txt
cp $output_folder/All_samples.txt $output_folder/../barcode_samples.txt
# move the reports to the report/barcode_read_distribution folder
mkdir -p $output_folder/../report/barcode_read_distribution
mv $output_folder/*.txt $output_folder/../report/barcode_read_distribution/
mv $output_folder/*.png $output_folder/../report/barcode_read_distribution/
echo
echo "All sam file splitted."
################# gene count
# count reads mapping to genes
output_folder=$all_output_folder/report/gene_count/
input_folder=$all_output_folder/sam_splitted
cell_ID=$all_output_folder/barcode_samples.txt
echo "Start the gene count...."
python $countscript $gtf_file $input_folder $cell_ID $core
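# the count script writes *.count, *.report and *_annotate.txt files into $input_folder;
# these are gathered into $output_folder below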
echo "Make the output folder and transfer the files..."
mkdir -p $output_folder
find $input_folder -name '*.count' -exec cat {} + > $output_folder/count.MM
find $input_folder -name '*.count' | xargs rm -f # comment this out if you want to see what's in it
find $input_folder -name '*.report' -exec cat {} + > $output_folder/report.MM
find $input_folder -name '*.report' | xargs rm -f
mv $input_folder/*_annotate.txt $output_folder/
echo "All output files are transferred~"
now=$(date)
echo "Current time : $now"
echo "load R/4.3.2 and then run the genecount processing script with Rscript gene_count_processing_sciRNAseq_CX.R"