Skip to content

Commit

Permalink
Revisions 1&2
Browse files Browse the repository at this point in the history
November 2023 - April 2024
  • Loading branch information
SziKayLeung authored Jun 20, 2024
2 parents f4f5cad + f7cb683 commit 8be7ab3
Show file tree
Hide file tree
Showing 67 changed files with 4,823 additions and 171 deletions.
20 changes: 20 additions & 0 deletions 0_utils/filter_default_reducecoverage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"full-splice_match": [
{
"perc_A_downstream_TTS":[0,59]
}
],
"rest": [
{
"perc_A_downstream_TTS":[0,59],
"all_canonical":"canonical",
"RTS_stage":"FALSE"
},
{
"perc_A_downstream_TTS":[0,59],
"RTS_stage":"FALSE",
"min_cov":0
}
]
}

Binary file not shown.
25 changes: 25 additions & 0 deletions 0_utils/numReadsTargeted.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Sample Sample Number of Iso-Seq Reads Number of ONT Reads Genotype Age (months)
K19 Mouse 1 49638 - WT 4
K23 Mouse 2 51020 - WT 8
K21 Mouse 3 53727 - WT 6
K18 Mouse 4 63328 - TG 2
K20 Mouse 5 59153 - TG 4
K17 Mouse 6 50058 - WT 2
S19 Mouse 7 24293 738056 WT 4
K24 Mouse 8 25705 763528 TG 8
L22 Mouse 9 28081 807960 TG 8
M21 Mouse 10 27351 725260 WT 2
O18 Mouse 11 27135 824401 TG 2
O23 Mouse 12 24598 800543 WT 8
O22 Mouse 13 25283 765738 TG 6
P19 Mouse 14 22427 730010 WT 6
T20 Mouse 15 26239 819564 TG 6
Q20 Mouse 16 28114 1317511 TG 8
Q21 Mouse 17 23770 1080464 WT 2
S18 Mouse 18 27546 1131981 TG 2
S23 Mouse 19 24322 998458 WT 8
Q18 Mouse 20 32683 1295303 TG 6
Q17 Mouse 21 13861 666928 WT 6
L18 Mouse 22 16530 793802 TG 4
Q23 Mouse 23 18572 1117300 WT 4
T18 Mouse 24 24938 1150108 TG 4
13 changes: 13 additions & 0 deletions 0_utils/numReadsWhole.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Sample Sample Number of Reads Genotype Age (months)
K23 Mouse 2 351454 WT 8
K18 Mouse 4 338557 TG 2
K17 Mouse 6 329636 WT 2
K24 Mouse 8 350120 TG 8
L22 Mouse 9 328831 TG 8
M21 Mouse 10 339563 WT 2
O18 Mouse 11 212387 TG 2
O23 Mouse 12 329376 WT 8
Q20 Mouse 16 342173 TG 8
Q21 Mouse 17 227113 WT 2
S18 Mouse 18 358413 TG 2
S23 Mouse 19 354154 WT 8
4 changes: 4 additions & 0 deletions 0_utils/primer.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>primer_5p
AAGCAGTGGTATCAACGCAGAGTACATGGG
>primer_3p
GTACTCTGCGTTGATACCACTGCTT
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ run_sqanti3(){

# sqanti qc
echo "Processing Sample $1 for SQANTI2 QC"

# no kalliso file
if [ $8 == "rnaseq" ]; then
python ${SQANTI3_DIR}/sqanti3_qc.py -t 30 $3/$2 ${GENOME_GTF} ${GENOME_FASTA} --CAGE_peak ${CAGE_PEAK} --coverage "./*SJ.out.bed" --polyA_motif_list ${POLYA} --genename --isoAnnotLite --gff3 ${GFF3} --report skip &> $1.sqanti.qc.log
Expand Down
50 changes: 50 additions & 0 deletions A_Global_Transcriptome/1_IsoSeq_Pipeline/1b_J20_run_isoseq3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash
#SBATCH --export=ALL # export all environment variables to the batch job
#SBATCH -D . # set working directory to .
#SBATCH -p mrcq # submit to the parallel queue
#SBATCH --time=20:00:00 # maximum walltime for the job
#SBATCH -A Research_Project-MRC148213 # research project to submit under
#SBATCH --nodes=1 # specify number of nodes
#SBATCH --ntasks-per-node=16 # specify number of processors per node
#SBATCH --mail-type=END # send email at job completion
#SBATCH [email protected] # email address
#SBATCH --array=0-1 # 2 samples
#SBATCH --output=1b_J20_run_isoseq3-%A_%a.o
#SBATCH --error=1b_J20_run_isoseq3-%A_%a.e


# J20 samples
#
# Body of a Slurm array job: each array task selects one sample name and one
# subreads BAM by SLURM_ARRAY_TASK_ID and passes them through
# run_CCS -> run_LIMA -> run_REFINE -> run_CLUSTER.
# The run_* functions are not defined in this script; presumably they come
# from 01_source_functions.sh sourced below.

##-------------------------------------------------------------------------

# source config file and function script
module load Miniconda2/4.3.21
# NOTE(review): this script uses .../sl693/... under /lustre whereas
# 1c_J20_run_isoseq3.sh uses .../lsl693/... -- confirm which path is correct.
SC_ROOT=/lustre/projects/Research_Project-MRC148213/sl693/scripts/rTg4510/A_Global_Transcriptome
source "${SC_ROOT}/1_IsoSeq_Pipeline/rTg4510_isoseq.config"
source "${SC_ROOT}/1_IsoSeq_Pipeline/01_source_functions.sh"

# Abort early rather than running the pipeline with empty arguments/paths.
: "${SLURM_ARRAY_TASK_ID:?must be submitted as a Slurm array job (sbatch --array)}"
: "${WKD_ROOT:?WKD_ROOT is not set; check that the config file was sourced}"


##-------------------------------------------------------------------------

# run as array (defined in config file)
rawDir=/lustre/projects/Research_Project-MRC148213/sl693/rTg4510/1_raw/A_WholeTranscriptome/J20_PacBio
# NOTE(review): SAMPLE is indexed from J20_ALL_SAMPLE_NAMES as left by the
# sourced config, while the BAM list is hard-coded here; confirm that entries
# 0 and 1 of the config list correspond to these two BAM files, in this order.
SAMPLE=${J20_ALL_SAMPLE_NAMES[${SLURM_ARRAY_TASK_ID}]}
J20_BAM_FILES=(
  "${rawDir}/m54082_190302_104610.subreads.bam"
  "${rawDir}/m54082_180816_074627.subreads.bam"
)
BAM_FILE=${J20_BAM_FILES[${SLURM_ARRAY_TASK_ID}]}

# An array index outside either list yields an empty string; stop here.
if [[ -z "${SAMPLE}" || -z "${BAM_FILE}" ]]; then
  echo "No sample name and/or BAM file defined for array task ${SLURM_ARRAY_TASK_ID}" >&2
  exit 1
fi


##-------------------------------------------------------------------------

# Isoseq3.4.0
# run_CCS <input_bam> <prefix_output_name> <Output_directory>
# run_LIMA $Sample $Input_CCS_directory $Output_directory <"no_multiplex"/"multiplex">
# run_REFINE $Sample $Input_LIMA_directory $Output_directory
# run_CLUSTER $Sample $Input_REFINE_directory $Output_directory
run_CCS "${BAM_FILE}" "${SAMPLE}" "${WKD_ROOT}/1_isoseq3/1_ccs"
run_LIMA "${SAMPLE}" "${WKD_ROOT}/1_isoseq3/1_ccs" "${WKD_ROOT}/1_isoseq3/2_lima" "no_multiplex"
run_REFINE "${SAMPLE}" "${WKD_ROOT}/1_isoseq3/2_lima" "${WKD_ROOT}/1_isoseq3/3_refine"
run_CLUSTER "${SAMPLE}" "${WKD_ROOT}/1_isoseq3/3_refine" "${WKD_ROOT}/1_isoseq3/4_cluster"


##-------------------------------------------------------------------------
#run_star ${SAMPLE} ${RNASEQ_FILTERED_DIR} ${RNASEQ_MAPPED_DIR}
52 changes: 52 additions & 0 deletions A_Global_Transcriptome/1_IsoSeq_Pipeline/1c_J20_run_isoseq3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
#SBATCH --export=ALL # export all environment variables to the batch job
#SBATCH -D . # set working directory to .
#SBATCH -p mrcq # submit to the parallel queue
#SBATCH --time=20:00:00 # maximum walltime for the job
#SBATCH -A Research_Project-MRC148213 # research project to submit under
#SBATCH --nodes=1 # specify number of nodes
#SBATCH --ntasks-per-node=16 # specify number of processors per node
#SBATCH --mail-type=END # send email at job completion
#SBATCH [email protected] # email address
#SBATCH --array=0-1 # 2 samples
#SBATCH --output=1c_J20_run_isoseq3-%A_%a.o
#SBATCH --error=1c_J20_run_isoseq3-%A_%a.e


# J20 samples: E18 and B21
#
# Body of a Slurm array job: each array task selects one sample name and its
# subreads BAM by SLURM_ARRAY_TASK_ID and passes them through
# run_CCS -> run_LIMA -> run_REFINE -> run_CLUSTER.
# The run_* functions are not defined in this script; presumably they come
# from 01_source_functions.sh sourced below.

##-------------------------------------------------------------------------

# source config file and function script
module load Miniconda2/4.3.21
SC_ROOT=/lustre/projects/Research_Project-MRC148213/lsl693/scripts/rTg4510/A_Global_Transcriptome
source "${SC_ROOT}/1_IsoSeq_Pipeline/rTg4510_isoseq.config"
source "${SC_ROOT}/1_IsoSeq_Pipeline/01_source_functions.sh"

# Abort early rather than running the pipeline with empty arguments/paths.
: "${SLURM_ARRAY_TASK_ID:?must be submitted as a Slurm array job (sbatch --array)}"
: "${WKD_ROOT:?WKD_ROOT is not set; check that the config file was sourced}"


##-------------------------------------------------------------------------

# run as array; both lists are set here (after sourcing the config) and are
# index-aligned: entry i of the sample names belongs to entry i of the BAMs
rawDir=/lustre/projects/Research_Project-MRC148213/lsl693/rTg4510/1_raw/A_WholeTranscriptome/J20_PacBio
J20_ALL_SAMPLE_NAMES=(E18 B21)
J20_BAM_FILES=(
  "${rawDir}/m54082_180818_105629.subreads.bam"
  "${rawDir}/m54082_190303_070925.subreads.bam"
)

SAMPLE=${J20_ALL_SAMPLE_NAMES[${SLURM_ARRAY_TASK_ID}]}
BAM_FILE=${J20_BAM_FILES[${SLURM_ARRAY_TASK_ID}]}

# An array index outside either list yields an empty string; stop here.
if [[ -z "${SAMPLE}" || -z "${BAM_FILE}" ]]; then
  echo "No sample name and/or BAM file defined for array task ${SLURM_ARRAY_TASK_ID}" >&2
  exit 1
fi


##-------------------------------------------------------------------------

# Isoseq3.4.0
# run_CCS <input_bam> <prefix_output_name> <Output_directory>
# run_LIMA $Sample $Input_CCS_directory $Output_directory <"no_multiplex"/"multiplex">
# run_REFINE $Sample $Input_LIMA_directory $Output_directory
# run_CLUSTER $Sample $Input_REFINE_directory $Output_directory
run_CCS "${BAM_FILE}" "${SAMPLE}" "${WKD_ROOT}/1_isoseq3/1_ccs"
run_LIMA "${SAMPLE}" "${WKD_ROOT}/1_isoseq3/1_ccs" "${WKD_ROOT}/1_isoseq3/2_lima" "no_multiplex"
run_REFINE "${SAMPLE}" "${WKD_ROOT}/1_isoseq3/2_lima" "${WKD_ROOT}/1_isoseq3/3_refine"
run_CLUSTER "${SAMPLE}" "${WKD_ROOT}/1_isoseq3/3_refine" "${WKD_ROOT}/1_isoseq3/4_cluster"


##-------------------------------------------------------------------------
#run_star ${SAMPLE} ${RNASEQ_FILTERED_DIR} ${RNASEQ_MAPPED_DIR}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
#SBATCH --export=ALL # export all environment variables to the batch job
#SBATCH -D . # set working directory to .
#SBATCH -p mrcq # submit to the parallel queue
#SBATCH --time=20:00:00 # maximum walltime for the job
#SBATCH -A Research_Project-MRC148213 # research project to submit under
#SBATCH --nodes=1 # specify number of nodes
#SBATCH --ntasks-per-node=16 # specify number of processors per node
#SBATCH --mail-type=END # send email at job completion
#SBATCH [email protected] # email address
#SBATCH --output=2b_map_annotate_isoform.o2
#SBATCH --error=2b_map_annotate_isoform.e2

# J20 C20, C21, B21 and E18 Iso-Seq pipeline
#
# Post-Iso-Seq3 processing of the merged J20 dataset. The merge / align /
# collapse / demux steps below are currently commented out; the only step
# that executes is the SQANTI3 QC run on the collapsed GFF at the bottom.


##-------------------------------------------------------------------------

# source config file and function script
module load Miniconda2/4.3.21
SC_ROOT=/lustre/projects/Research_Project-MRC148213/lsl693/scripts/rTg4510/A_Global_Transcriptome
LOGEN_ROOT=/lustre/projects/Research_Project-MRC148213/lsl693/scripts/LOGen
source "${SC_ROOT}/1_IsoSeq_Pipeline/rTg4510_isoseq.config"
source "${SC_ROOT}/1_IsoSeq_Pipeline/01_source_functions.sh"
export PATH="${PATH}:${LOGEN_ROOT}/miscellaneous"
export PATH="${PATH}:${LOGEN_ROOT}/assist_isoseq_processing"
export PATH="${PATH}:${LOGEN_ROOT}/assist_ont_processing"

# Set after sourcing the config, so this list is the one in effect below.
J20_ALL_SAMPLE_NAMES=(B21 C20 C21 E18)

# Variables the QC step depends on; abort with a message if any is missing
# instead of running SQANTI3 against empty paths.
: "${WKD_ROOT:?WKD_ROOT is not set; check that the config file was sourced}"
: "${J20NAME:?J20NAME is not set; check that the config file was sourced}"
: "${SQANTI3_DIR:?SQANTI3_DIR is not set; check that the config file was sourced}"

##-------------------------------------------------------------------------

# merging_at_refine <input_flnc_bam_dir> <output_directory> <output_name> <samples.....>
#merging_at_refine $WKD_ROOT/1_isoseq3/3_refine $WKD_ROOT/1_isoseq3/5_merged_cluster ${J20NAME} ${J20_ALL_SAMPLE_NAMES[@]}
#refine2fasta $WKD_ROOT/1_isoseq3/3_refine ${J20_ALL_SAMPLE_NAMES[@]}

# align individual samples
# run_pbmm2align <output_name> <clustered_dir> <mapped_dir>
#for i in ${J20_ALL_SAMPLE_NAMES[@]}; do run_pbmm2align $i $WKD_ROOT/1_isoseq3/4_cluster $WKD_ROOT/2_post_isoseq3/6_minimap; done

# filter_alignment <name> <mapped_dir>
#for i in ${J20_ALL_SAMPLE_NAMES[@]}; do filter_alignment $i $WKD_ROOT/2_post_isoseq3/6_minimap; done

# run_map_cupcakecollapse <sample_prefix_input/output_name> <isoseq3_input_directory> <mapping_output_directory> <tofu_output_directory>
#run_map_cupcakecollapse ${J20NAME} $WKD_ROOT/1_isoseq3/5_merged_cluster $WKD_ROOT/2_post_isoseq3/6_minimap $WKD_ROOT/2_post_isoseq3/7_tofu

# demux <name> <refine_dir> <cluster_report> <tofu_dir>
#demux ${J20NAME} $WKD_ROOT/1_isoseq3/3_refine $WKD_ROOT/1_isoseq3/5_merged_cluster/${J20NAME}.clustered.cluster_report.csv $WKD_ROOT/2_post_isoseq3/7_tofu


##-------------------------------------------------------------------------

source activate sqanti2_py3

# SQANTI3 output and the log are written relative to the current directory,
# so stop if the output directory cannot be entered.
cd "${WKD_ROOT}/2_post_isoseq3/9_sqanti3" || {
  echo "Cannot change into ${WKD_ROOT}/2_post_isoseq3/9_sqanti3" >&2
  exit 1
}

# NOTE(review): -t 30 asks for 30 threads while the job header requests
# --ntasks-per-node=16; confirm the intended thread count.
python "${SQANTI3_DIR}/sqanti3_qc.py" -t 30 \
  "${WKD_ROOT}/2_post_isoseq3/7_tofu/${J20NAME}.collapsed.gff" \
  "${GENOME_GTF}" "${GENOME_FASTA}" \
  --CAGE_peak "${CAGE_PEAK}" \
  --polyA_motif_list "${POLYA}" \
  --genename --isoAnnotLite --report skip \
  &> "${J20NAME}.collapsed.sqanti.qc.log"

74 changes: 39 additions & 35 deletions A_Global_Transcriptome/1_IsoSeq_Pipeline/rTg4510_isoseq.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,66 +19,70 @@

## Output name and relevant info
export NAME=WholeIsoSeq
export J20NAME=WholeJ20IsoSeq

## Output root directory filepath (ensure path exists)
export rTG4510=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/scripts/rTg4510
export WKD_ROOT=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/rTg4510/A_IsoSeq_Whole
export rTG4510=/lustre/projects/Research_Project-MRC148213/lsl693/scripts/rTg4510
export WKD_ROOT=/lustre/projects/Research_Project-MRC148213/lsl693/rTg4510/A_IsoSeq_Whole


## ---------------------------

## Source functions and scripts directory
export SC_ROOT=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/scripts/rTg4510/A_Global_Transcriptome
export GENERALFUNC=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/scripts/General
export TAMAFUNC=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/scripts/General/2_Transcriptome_Annotation/TAMA
export SC_ROOT=/lustre/projects/Research_Project-MRC148213/lsl693/scripts/rTg4510/A_Global_Transcriptome
#export GENERALFUNC=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/scripts/General
#export TAMAFUNC=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/scripts/General/2_Transcriptome_Annotation/TAMA


## ---------------------------

## Reference
export REFERENCE=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/references
export REFERENCE=/lustre/projects/Research_Project-MRC148213/lsl693/reference
export GENOME_FASTA=$REFERENCE/mouse/mm10.fa
export GENOME_GTF=$REFERENCE/annotation/gencode.vM22.annotation.gtf
export GENOME_LNCRNA_GTF=$REFERENCE/mouse/gencode.vM25.long_noncoding_RNAs.gtf
export STAR_REF_DIR=${REFERENCE}/STAR_main
#export GENOME_LNCRNA_GTF=$REFERENCE/mouse/gencode.vM25.long_noncoding_RNAs.gtf
#export STAR_REF_DIR=${REFERENCE}/STAR_main

# Primers and Probes
export FASTA=$REFERENCE/Primers/primer.fasta
export TARGETED_FASTA=$REFERENCE/Primers/targeted.primer.fasta
export FASTA=$rTG4510/0_utils/primer.fasta
#export TARGETED_FASTA=$REFERENCE/Primers/targeted.primer.fasta

# transgene sequences
source $rTG4510/B_Targeted_Transcriptome/1_IsoSeq_Pipeline/WT_TG_seq_differentiators.fa
#source $rTG4510/B_Targeted_Transcriptome/1_IsoSeq_Pipeline/WT_TG_seq_differentiators.fa

## ---------------------------

## Long read data (Iso-Seq)
SAMPLE_CONFIG=$SC_ROOT/1_IsoSeq_Pipeline/rTg4510_samples.tsv
export ALL_SAMPLE_NAMES=($(grep "^[^#;]" $SAMPLE_CONFIG | awk '{print $1}'))
export BAM_FILES=($(grep "^[^#;]" $SAMPLE_CONFIG | awk '{print $2}'))
#SAMPLE_CONFIG=$rTg4510/1_IsoSeq_Pipeline/rTg4510_samples.tsv
#export ALL_SAMPLE_NAMES=($(grep "^[^#;]" $SAMPLE_CONFIG | awk '{print $1}'))
#export BAM_FILES=($(grep "^[^#;]" $SAMPLE_CONFIG | awk '{print $2}'))

J20_SAMPLE_CONFIG=$rTG4510/0_utils/J20_samples.tsv
export J20_ALL_SAMPLE_NAMES=($(grep "^[^#;]" $J20_SAMPLE_CONFIG | awk '{print $1}'))
export J20_BAM_FILES=($(grep "^[^#;]" $J20_SAMPLE_CONFIG | awk '{print $2}'))


## ---------------------------

# Short read data (RNA-Seq)
RNASEQ_FILTERED_DIR=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/rTg4510/1_raw/C_rnaseq_raw/Tg4510_filtered
RNASEQ_SAMPLES_NAMES=$(awk '{print $1}' $SC_ROOT/1_IsoSeq_Pipeline/rTg4510_rnaseq_samples.tsv)
RNASEQ_SQ_INPUT=${ALL_SAMPLE_NAMES[@]}
RNASEQ_MAPPED_DIR=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/rTg4510/C_RNASeq/2_aligned/Matched_Whole
#RNASEQ_FILTERED_DIR=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/rTg4510/1_raw/C_rnaseq_raw/Tg4510_filtered
#RNASEQ_SAMPLES_NAMES=$(awk '{print $1}' $SC_ROOT/1_IsoSeq_Pipeline/rTg4510_rnaseq_samples.tsv)
#RNASEQ_SQ_INPUT=${ALL_SAMPLE_NAMES[@]}
#RNASEQ_MAPPED_DIR=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/rTg4510/C_RNASeq/2_aligned/Matched_Whole

## ---------------------------

## Software
export SOFTDIR=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/software
export SOFTDIR=/lustre/projects/Research_Project-MRC148213/lsl693/software

export CUPCAKE=$SOFTDIR/Post_Isoseq3/cDNA_Cupcake
export CUPCAKE=$SOFTDIR/cDNA_Cupcake
export ANNOTATION=$CUPCAKE/annotation
export SEQUENCE=$CUPCAKE/sequence
export PYTHONPATH=$PYTHONPATH:$SEQUENCE
export SQANTI2_dir=$SOFTDIR/Post_Isoseq3/SQANTI2
export SQANTI3_DIR=$SOFTDIR/SQANTI3
export SQ_Report=$SOFTDIR/Post_Isoseq3/SQANTI2/utilities/SQANTI_report2.R
export TAPPAS_dir=$SOFTDIR/TAPPAS
export TAMA_DIR=$SOFTDIR/tama/tama_go/filter_transcript_models
export SQ_Report=$SOFTDIR/SQANTI3/utilities/SQANTI_report2.R
#export TAPPAS_dir=$SOFTDIR/TAPPAS
#export TAMA_DIR=$SOFTDIR/tama/tama_go/filter_transcript_models

## ---------------------------

Expand All @@ -92,17 +96,17 @@ GFF3=$TAPPAS_dir/Mus_musculus_GRCm38_Ensembl_86.gff3
## ---------------------------

## Internal Scripts
DEMUXFUNCTIONSGLOB=$GENERALFUNC/2_Transcriptome_Annotation/Cupcake_Demultiplex.R
TAMAMERGE=$TAMAFUNC/TAMA_Merge_Prepare.R
TAMASUBSET=$GENERALFUNC/2_Transcriptome_Annotation/TAMA/tama_sqanti_classgtfsubset.R
TAMASUBSETFASTA=$GENERALFUNC/2_Transcriptome_Annotation/TAMA/tama_sqanti_fastasubset.py
ISMREMOVE=$GENERALFUNC/2_Transcriptome_Annotation/3ISM_remove_classification.R
SQSUBSET=$GENERALFUNC/2_Transcriptome_Annotation/sqanti_classgtfsubset.R
SQCOUNT=$GENERALFUNC/2_Transcriptome_Annotation/subset_casecontrol_by_counts.R
SQCOUNT_SAMPLE=$GENERALFUNC/2_Transcriptome_Annotation/subset_sample_by_counts.R
ISOCOL=$GENERALFUNC/2_Transcriptome_Annotation/colour_common_targeted_transcripts.py
MODKAL=$GENERALFUNC/2_Transcriptome_Annotation/TabSeparated_Kallisto.R
TALEXP=$GENERALFUNC/5_TappAS_Differential/talon2tappas_expression.R
#DEMUXFUNCTIONSGLOB=$GENERALFUNC/2_Transcriptome_Annotation/Cupcake_Demultiplex.R
#TAMAMERGE=$TAMAFUNC/TAMA_Merge_Prepare.R
#TAMASUBSET=$GENERALFUNC/2_Transcriptome_Annotation/TAMA/tama_sqanti_classgtfsubset.R
#TAMASUBSETFASTA=$GENERALFUNC/2_Transcriptome_Annotation/TAMA/tama_sqanti_fastasubset.py
#ISMREMOVE=$GENERALFUNC/2_Transcriptome_Annotation/3ISM_remove_classification.R
#SQSUBSET=$GENERALFUNC/2_Transcriptome_Annotation/sqanti_classgtfsubset.R
#SQCOUNT=$GENERALFUNC/2_Transcriptome_Annotation/subset_casecontrol_by_counts.R
#SQCOUNT_SAMPLE=$GENERALFUNC/2_Transcriptome_Annotation/subset_sample_by_counts.R
#ISOCOL=$GENERALFUNC/2_Transcriptome_Annotation/colour_common_targeted_transcripts.py
#MODKAL=$GENERALFUNC/2_Transcriptome_Annotation/TabSeparated_Kallisto.R
#TALEXP=$GENERALFUNC/5_TappAS_Differential/talon2tappas_expression.R

cd $WKD_ROOT; mkdir -p 1_isoseq3 2_post_isoseq3
cd $WKD_ROOT/1_isoseq3; mkdir -p 1_ccs 2_lima 3_refine 4_cluster 5_merged_cluster
Expand Down
Loading

0 comments on commit 8be7ab3

Please sign in to comment.