#!/bin/sh
## construction of the basic directory structure
git clone https://github.com/drtamermansour/horse_trans.git
cd horse_trans
horse_trans=$(pwd)
mkdir -p $horse_trans/{resources,prepdata,extdata,tissue_merge,refGenome,public_assemblies,downloads} ## you should already have 2 folders (scripts & track_hub) from cloning the original repository.
## read the user configurations
source $horse_trans/user_config.txt
cat $horse_trans/user_config.txt
## create a config file to contain all the paths to be used by all pipelines
> $horse_trans/config.txt
echo "script_path=$horse_trans/scripts" >> $horse_trans/config.txt
echo "resources=$horse_trans/resources" >> $horse_trans/config.txt
echo "prepData=$horse_trans/prepdata" >> $horse_trans/config.txt
echo "extData=$horse_trans/extdata" >> $horse_trans/config.txt
echo "tissue_merge=$horse_trans/tissue_merge" >> $horse_trans/config.txt
echo "genome_dir=$horse_trans/refGenome" >> $horse_trans/config.txt
echo "pubAssemblies=$horse_trans/public_assemblies" >> $horse_trans/config.txt
echo "track_hub=$horse_trans/track_hub" >> $horse_trans/config.txt
source $horse_trans/config.txt
## UCSC kent commands are used a lot throughout the pipeline.
## http://genome-source.cse.ucsc.edu/gitweb/?p=kent.git;a=blob;f=src/userApps/README
## http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/FOOTER
## wget -r --no-directories ftp://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/
###########################################################################################
#### prepare the raw fastq files:
## Every tissue has a separate folder carrying its name (maximum 14 letters) in $prepData.
## Every RNAseq library should have a separate folder inside the corresponding tissue folder.
## The library folder name should start with PE_ or SE_ according to the sequencing type,
## then the read length followed by an underscore,
## then fr.unstranded_ , fr.firststrand_ or fr.secondstrand_ according to the library type,
## then the owner name (or names separated by dots) followed by an underscore,
## then the date of sequencing as MMDDYYYY.
## The raw data files should be kept in a folder named fastq_data inside the library folder.
## The raw data files should be prepared so that they have the encoding "Sanger / Illumina 1.9".
## The file names should fit the format *_R1_*.fastq.gz & *_R2_*.fastq.gz for PE reads or *_SR_*.fastq.gz for SE.
## All reads in any given file should belong to ONE sequencing lane.
## If there are sample replicates from different lanes, you can add a text file called "replicates.txt";
## each line in this file should have the names of one sample's replicates, space-separated (only the *_R1_*.fastq.gz files for PE reads or *_SR_*.fastq.gz for SE).
## The prefix (the part of the file name before _R1_ , _R2_ or _SR_) should be unique.
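## e.g., a hypothetical library that satisfies these conventions (sample file names are illustrative only):
## $prepData/PBMCs/PE_49_fr.unstranded_bioproj.265983_10302014/fastq_data/sampleA_lane1_R1_001.fastq.gz
## $prepData/PBMCs/PE_49_fr.unstranded_bioproj.265983_10302014/fastq_data/sampleA_lane1_R2_001.fastq.gz
## $prepData/PBMCs/PE_49_fr.unstranded_bioproj.265983_10302014/fastq_data/replicates.txt
## with a replicates.txt line such as: sampleA_lane1_R1_001.fastq.gz sampleA_lane2_R1_001.fastq.gz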
## a representative worked example for one SRA repository
newLib=$"PBMCs/PE_49_fr.unstranded_bioproj.265983_10302014"
SRA_URL=$"ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/ERR/ERR653"
mkdir -p $rawData/$newLib
cd $rawData/PBMCs/PE_49_fr.unstranded_bioproj.265983_10302014
bash $script_path/prep_proj.sh $SRA_URL $extData $script_path ## the script assumes no sample replicates
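## a minimal sketch of what prep_proj.sh is assumed to do for each run (fastq-dump is from NCBI sra-tools;
## the exact steps live in the script itself):
#wget -r --no-directories ftp://$SRA_URL/
#for sra in *.sra; do fastq-dump --split-files --gzip $sra; done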
###########################################################################################
## define the list of working directories and the list of samples in each. This is where you can edit the output list file(s) to restrict the processing to certain target(s)
rm -f $horse_trans/working_list.txt
for work_dir in $prepData/*/{PE_*,SE_*}; do if [ -d $work_dir/fastq_data ]; then
echo $work_dir >> $horse_trans/working_list.txt
rm -f $work_dir/fastq_data/sample_list.txt
for f in $work_dir/fastq_data/{*_R1_*.fastq.gz,*_SR_*.fastq.gz}; do if [ -f $f ]; then
echo $f >> $work_dir/fastq_data/sample_list.txt; fi; done;
fi; done;
## get read length statistics
headers=$(Rscript -e 'cat("Tissue", "Library", "No_of_samples", "No_of_reads(M)", "Total_bp_count(Gb)", "Min_length", "Max_length", "Median_length", "Mean_length", sep="\t");')
echo "$headers" > $horse_trans/raw_statistics.txt
while read work_dir; do
echo $work_dir
cd $work_dir/fastq_data
sample_list=$work_dir/fastq_data/sample_list.txt
cat sample_list.txt | xargs zcat | awk '{if(NR%4==2) print length($1)}' > input.readslength.txt
stat=$(Rscript -e 'd<-scan("input.readslength.txt", quiet=TRUE); cat(round(length(d)/10^6,2), round(sum(d)/10^9,2), min(d), max(d), median(d), mean(d), sep="\t");')
lib=$(basename $work_dir)
tissue=$(dirname $work_dir | xargs basename)
no=$(cat $sample_list | wc -l)
echo "$tissue"$'\t'"$lib"$'\t'"$no"$'\t'"$stat" >> $horse_trans/raw_statistics.txt
done < $horse_trans/working_list.txt
## prepare sorted working list according to read length
tail -n+2 $horse_trans/raw_statistics.txt | sort -k8,8nr | awk -v myRoot=$prepData '{ print myRoot"/"$1"/"$2 }' > $horse_trans/working_list_sorted.txt
###########################################################################################
## prepare list of external data
rm -f $horse_trans/extdata_list.txt
for work_dir in $extData/*/{PE_*,SE_*}; do if [ -d $work_dir/fastq_data ]; then
echo $work_dir >> $horse_trans/extdata_list.txt
rm -f $work_dir/fastq_data/sample_list.txt
for f in $work_dir/fastq_data/{*_R1_*.fastq.gz,*_SR_*.fastq.gz}; do if [ -f $f ]; then
echo $f >> $work_dir/fastq_data/sample_list.txt; fi; done;
fi; done;
###########################################################################################
#### Read trimming
## This step requires an input working_list & sample_list and an output folder
## the module should produce trimmed reads in the form of (*_R1_*.pe.se.fq and *_R2_*.pe.fq) for PE libraries and (*_SR_*.se.fq) for SE
## The output is a sample list with *_R1_*.pe.se.fq and *_SR_*.se.fq files
## Mild trimming with Trimmomatic using sliding window
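## a minimal sketch of the per-sample Trimmomatic call assumed inside run_adapter_trimmer.sh
## (adapter file, window settings and MINLEN here are illustrative, not the pipeline's exact values):
#java -jar trimmomatic.jar PE -phred33 s_R1_001.fastq.gz s_R2_001.fastq.gz \
#     s_R1_001.pe.fq s_R1_001.se.fq s_R2_001.pe.fq s_R2_001.se.fq \
#     ILLUMINACLIP:adapters.fa:2:30:10 SLIDINGWINDOW:4:2 MINLEN:25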
while read work_dir; do
echo $work_dir
mkdir -p $work_dir/trimmed_RNA_reads
cd $work_dir/trimmed_RNA_reads
lib=$(basename $work_dir | cut -d"_" -f 1) ## PE or SE
sample_list=$work_dir/fastq_data/sample_list.txt
bash ${script_path}/run_adapter_trimmer.sh $sample_list $lib $platform
done < $horse_trans/working_list.txt
## Check for successful trimming and troubleshoot the failed trimming jobs (requires T_Trim.e)
while read work_dir; do
cd $work_dir/trimmed_RNA_reads
sample_list=$work_dir/fastq_data/failedSamples.txt ## define the path of an empty file
bash $script_path/check_ad_trim.sh "$sample_list"
x=$(cat $sample_list | wc -l)
if [ $x -ne 0 ]; then
lib=$(basename $work_dir | cut -d"_" -f 1)
echo "Failed jobs in: "$work_dir
bash ${script_path}/run_adapter_trimmer.sh $sample_list $lib $platform
fi
done < $horse_trans/working_list.txt
#### merge singletons
while read work_dir; do if [ -d $work_dir/trimmed_RNA_reads ]; then
cd $work_dir/trimmed_RNA_reads
## change /2 to /1 in s2_se then combine single reads
for f in $work_dir/trimmed_RNA_reads/*_R1_*.se.fq; do if [ -f $f ]; then echo $f; f2=$(basename $f | sed 's/_R1_/_R2_/'); newf=$(basename $f | sed 's/_R1_/_R_/'); sed 's/\/2$/\/1/g' $f2 > $f2.temp; cat $f $f2.temp > $newf; fi; done;
rm -f *.se.fq.temp
## merge the single reads to the end of s1_pe file to make s1_pe_se
for f in $work_dir/trimmed_RNA_reads/*_R1_*.pe.fq; do if [ -f $f ]; then echo $f; fr=$(basename $f | sed 's/_R1_/_R_/'); fr2=$(echo $fr | sed 's/.pe.fq/.se.fq/'); newf=$(basename $f | sed 's/.pe.fq/.pe.se.fq/'); cat $f $fr2 > $newf; fi; done;
#rm {*_R1_*.pe.fq,*.se.fq} ## all you need is *_R1_*.pe.se.fq and *_R2_*.pe.fq
fi
done < $horse_trans/working_list.txt
## define the list of samples for subsequent analysis
## This is where you can edit the output list file(s) to restrict the processing to certain target(s)
while read work_dir; do if [ -d $work_dir/trimmed_RNA_reads ]; then
rm -f $work_dir/trimmed_RNA_reads/sample_list.txt
for f in $work_dir/trimmed_RNA_reads/{*_R1_*.pe.se.fq,*_SR_*.se.fq}; do if [ -f $f ]; then
echo $f >> $work_dir/trimmed_RNA_reads/sample_list.txt; fi; done;
fi; done < $horse_trans/working_list.txt
###########################################################################################
## Assess read length statistics after trimming
headers=$(Rscript -e 'cat("Tissue", "Library", "No_of_samples", "No_of_reads(M)", "Total_bp_count(Gb)", "Min_length", "Max_length", "Median_length", "Mean_length", sep="\t");')
echo "$headers" > $horse_trans/trimmed_statistics.txt
while read work_dir; do
echo $work_dir
cd $work_dir/trimmed_RNA_reads
sample_list=$work_dir/trimmed_RNA_reads/sample_list.txt
lib=$(basename $work_dir)
tissue=$(dirname $work_dir | xargs basename)
no=$(cat $sample_list | wc -l)
libType=$(basename $work_dir | cut -d"_" -f 1) ## PE or SE
if [ "$libType" = $"PE" ]; then
## stats of the R1 files
cat sample_list.txt | sed 's/.pe.se.fq/.pe.fq/'| xargs cat | awk '{if(NR%4==2) print length($1)}' > trimmed_R1.readslength.txt
stat=$(Rscript -e 'd<-scan("trimmed_R1.readslength.txt", quiet=TRUE); cat(round(length(d)/10^6,2), round(sum(d)/10^9,2), min(d), max(d), median(d), mean(d), sep="\t");')
echo "$tissue"$'\t'"$lib"$'_R1\t'"$no"$'\t'"$stat" >> $horse_trans/trimmed_statistics.txt
## stats of the R2 files
cat sample_list.txt | sed 's/\(.*\)_R1_\(.*\).pe.se.fq/\1_R2_\2.pe.fq/'| xargs cat | awk '{if(NR%4==2) print length($1)}' > trimmed_R2.readslength.txt
stat=$(Rscript -e 'd<-scan("trimmed_R2.readslength.txt", quiet=TRUE); cat(round(length(d)/10^6,2), round(sum(d)/10^9,2), min(d), max(d), median(d), mean(d), sep="\t");')
echo "$tissue"$'\t'"$lib"$'_R2\t'"$no"$'\t'"$stat" >> $horse_trans/trimmed_statistics.txt
## stats of the singleton files
cat sample_list.txt | sed 's/\(.*\)_R1_\(.*\).pe.se.fq/\1_R_\2.se.fq/'| xargs cat | awk '{if(NR%4==2) print length($1)}' > trimmed_SE.readslength.txt
stat=$(Rscript -e 'd<-scan("trimmed_SE.readslength.txt", quiet=TRUE); cat(round(length(d)/10^6,2), round(sum(d)/10^9,2), min(d), max(d), median(d), mean(d), sep="\t");')
echo "$tissue"$'\t'"$lib"$'_SE\t'"$no"$'\t'"$stat" >> $horse_trans/trimmed_statistics.txt
elif [ "$libType" = $"SE" ]; then
cat sample_list.txt | xargs cat | awk '{if(NR%4==2) print length($1)}' > trimmed.readslength.txt
stat=$(Rscript -e 'd<-scan("trimmed.readslength.txt", quiet=TRUE); cat(round(length(d)/10^6,2), round(sum(d)/10^9,2), min(d), max(d), median(d), mean(d), sep="\t");')
echo "$tissue"$'\t'"$lib"$'\t'"$no"$'\t'"$stat" >> $horse_trans/trimmed_statistics.txt
fi; done < $horse_trans/working_list.txt
###########################################################################################
## get the reference genome
cd $genome_dir
wget --timestamping 'ftp://hgdownload.cse.ucsc.edu/goldenPath/equCab2/bigZips/chromFa.tar.gz' -O chromFa.tar.gz
tar xvzf chromFa.tar.gz
## prepare Bowtie2Index (for Tophat mapping)
mkdir -p $genome_dir/Bowtie2Index && cd $genome_dir/Bowtie2Index
cat ../*.fa > genome.fa
bash ${script_path}/run_bowtie2-build.sh genome.fa genome $platform
echo "genome=$genome_dir/Bowtie2Index/genome.fa" >> $horse_trans/config.txt
echo "Bowtie2_genome_index_base=$genome_dir/Bowtie2Index/genome" >> $horse_trans/config.txt
source $horse_trans/config.txt
## prepare BWA index (for GATK variant analysis)
mkdir -p $genome_dir/BwaIndex && cd $genome_dir/BwaIndex
cat ../*.fa > genome.fa
bash ${script_path}/run_bwa-index.sh genome.fa
echo "Bwa_ref=$genome_dir/BwaIndex/genome.fa" >> $horse_trans/config.txt
source $horse_trans/config.txt
## prepare GATK dictionary and index (for GATK variant analysis)
mkdir -p $genome_dir/gatkIndex && cd $genome_dir/gatkIndex
cat ../*.fa > genome.fa
bash ${script_path}/run_gatk-index.sh genome.fa
echo "gatk_ref=$genome_dir/gatkIndex/genome.fa" >> $horse_trans/config.txt
echo "gatk_ref_index=$genome_dir/gatkIndex/genome.fa.fai" >> $horse_trans/config.txt
echo "gatk_ref_dict=$genome_dir/gatkIndex/genome.dict" >> $horse_trans/config.txt
source $horse_trans/config.txt
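## minimal sketches of the index-building commands assumed inside the three wrappers above
## (the picard.jar path is illustrative):
#bowtie2-build genome.fa genome
#bwa index genome.fa
#samtools faidx genome.fa
#java -jar picard.jar CreateSequenceDictionary R=genome.fa O=genome.dict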
###########################################################################################
## map the genomes: create liftover files to allow mapping of NCBI annotation to UCSC tracks
## http://genomewiki.ucsc.edu/index.php/LiftOver_Howto
bash $script_path/mapGenome.sh $genome ## ends by creating ncbi/NCBItoUCSC_map.sorted.chain
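## mapGenome.sh is assumed to follow the standard same-species liftOver recipe from the howto above, roughly:
#blat ucscGenome.fa ncbiChunk.fa ncbiChunk.psl -tileSize=12 -minIdentity=98 -fastMap  ## align NCBI chunks to the UCSC assembly
#axtChain -linearGap=medium -psl all.psl ucscGenome.2bit ncbiGenome.2bit all.chain    ## chain the alignments
#chainMergeSort all.chain > all.sorted.chain                                          ## sort and merge the chains
#chainNet all.sorted.chain ucsc.chrom.sizes ncbi.chrom.sizes all.net /dev/null        ## net the chains
#netChainSubset all.net all.sorted.chain NCBItoUCSC_map.chain                         ## extract the liftOver chain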
###########################################################################################
## Create GTF file for refGenes track of UCSC
## generation of GTF from UCSC tables using the guidelines of genomewiki.ucsc
## http://genomewiki.ucsc.edu/index.php/Genes_in_gtf_or_gff_format
## The commands of wikigenome are modified to match the table schemes
## Note: genePredToGtf can resolve duplicate transcript ids but not duplicate gene ids, so the 10-field genePred format, which uses transcript names as gene names, produces no duplicates, while the extended genePred takes separate gene names from column 12 and is susceptible to gene name duplication
cd $genome_dir
wget http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/refGene.txt.gz
ucscTable=$"refGene.txt.gz"
output_GTF=$"refGene.gtf"
#bash ${script_path}/ucscTableToGTF.sh $ucscTable $output_GTF
zcat $ucscTable | cut -f2-16 | $script_path/UCSC_kent_commands/genePredToGtf file stdin $output_GTF
echo "refGTF_file=$genome_dir/refGene.gtf" >> $horse_trans/config.txt
output_GTF=$"refGene_transcripts.gtf"
#bash ${script_path}/ucscTableToGTF2.sh $ucscTable $output_GTF
zcat $ucscTable | cut -f2-11 | $script_path/UCSC_kent_commands/genePredToGtf file stdin $output_GTF
echo "refTransGTF_file=$genome_dir/refGene_transcripts.gtf" >> $horse_trans/config.txt
zcat $ucscTable | cut -f2-16 | $script_path/genePredToBed > refGene.bed
echo "refBED_file=$genome_dir/refGene.bed" >> $horse_trans/config.txt
source $horse_trans/config.txt
## Get the NCBI annotation files
wget ftp://ftp.ncbi.nih.gov/genomes/Equus_caballus/GFF/ref_EquCab2.0_top_level.gff3.gz
gunzip ref_EquCab2.0_top_level.gff3.gz
sed 's/28908588/28908590/' ref_EquCab2.0_top_level.gff3 > ref_EquCab2.0_top_level_edit.gff3
$script_path/UCSC_kent_commands/gff3ToGenePred -useName ref_EquCab2.0_top_level_edit.gff3 ref_EquCab2.0_top_level.gpred
## exclude non-RNA entries, e.g. CDSs with no parent transcripts, gene segments, etc.
egrep "^rna|^NM|^NR|^XM|^XR" ref_EquCab2.0_top_level.gpred > ref_EquCab2.0_top_level_rna.gpred
$script_path/UCSC_kent_commands/liftOver ref_EquCab2.0_top_level_rna.gpred $genome_dir/ncbi/NCBItoUCSC_map.sorted.chain ref_EquCab2.0_top_level_mapped_rna.gpred unMapped -genePred
$script_path/UCSC_kent_commands/genePredToGtf file ref_EquCab2.0_top_level_mapped_rna.gpred ref_EquCab2.0_top_level_rna.gtf
echo "ncbiGTF_file=$genome_dir/ref_EquCab2.0_top_level_rna.gtf" >> $horse_trans/config.txt
cat ref_EquCab2.0_top_level_mapped_rna.gpred | $script_path/genePredToBed > ref_EquCab2.0_top_level_mapped_rna.bed
echo "ncbiBED_file=$genome_dir/ref_EquCab2.0_top_level_mapped_rna.bed" >> $horse_trans/config.txt
$script_path/UCSC_kent_commands/gff3ToGenePred ref_EquCab2.0_top_level_edit.gff3 ref_EquCab2.0_top_level_noName.gpred
## exclude non-RNA entries, e.g. CDSs with no parent transcripts, gene segments, etc.
grep "^rna" ref_EquCab2.0_top_level_noName.gpred > ref_EquCab2.0_top_level_noName_rna.gpred
$script_path/UCSC_kent_commands/liftOver ref_EquCab2.0_top_level_noName_rna.gpred ncbi/NCBItoUCSC_map.sorted.chain ref_EquCab2.0_top_level_mapped_noName_rna.gpred unMapped_noName -genePred
$script_path/UCSC_kent_commands/genePredToGtf file ref_EquCab2.0_top_level_mapped_noName_rna.gpred ref_EquCab2.0_top_level_noName_rna.gtf
echo "ncbiNoNameGTF_file=$genome_dir/ref_EquCab2.0_top_level_noName_rna.gtf" >> $horse_trans/config.txt
cat ref_EquCab2.0_top_level_mapped_noName_rna.gpred | $script_path/genePredToBed > ref_EquCab2.0_top_level_mapped_noName_rna.bed
echo "ncbiNoNameBED_file=$genome_dir/ref_EquCab2.0_top_level_mapped_noName_rna.bed" >> $horse_trans/config.txt
source $horse_trans/config.txt
## Get the Ensembl GTF files
wget http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/ensGene.txt.gz
ucscTable=$"ensGene.txt.gz"
output_GTF=$"ensGene.gtf"
#bash ${script_path}/ucscTableToGTF.sh $ucscTable $output_GTF
zcat $ucscTable | cut -f2-16 | $script_path/UCSC_kent_commands/genePredToGtf file stdin $output_GTF
echo "ensGTF_file=$genome_dir/ensGene.gtf" >> $horse_trans/config.txt
zcat $ucscTable | cut -f2-16 | $script_path/genePredToBed > ensGene.bed
echo "ensBED_file=$genome_dir/ensGene.bed" >> $horse_trans/config.txt
source $horse_trans/config.txt
#wget ftp://ftp.ensembl.org/pub/release-80/gtf/equus_caballus/Equus_caballus.EquCab2.80.gtf.gz
#gunzip Equus_caballus.EquCab2.80.gtf.gz
#echo "ensGTF_file=$genome_dir/Equus_caballus.EquCab2.80.gtf" >> $horse_trans/config.txt
#source $horse_trans/config.txt
###########################################################################################
## get known variants
mkdir $genome_dir/knowVar
cd $genome_dir/knowVar
wget --timestamping 'ftp://ftp.ensembl.org/pub/release-82/variation/vcf/equus_caballus/Equus_caballus.vcf.gz' -O Equus_caballus.vcf.gz
wget --timestamping 'ftp://ftp.ensembl.org/pub/release-82/variation/vcf/equus_caballus/Equus_caballus_structural_variations.vcf.gz' -O Equus_caballus_structural_variations.vcf.gz
wget --timestamping 'ftp://ftp.ensembl.org/pub/release-82/variation/vcf/equus_caballus/Equus_caballus_incl_consequences.vcf.gz' -O Equus_caballus_incl_consequences.vcf.gz
gunzip *.gz
## change the names of the chromosomes to match the UCSC genome (but keep the file coordinates 1-based)
grep -v "^#" Equus_caballus_structural_variations.vcf | awk -F "\t" -v OFS='\t' '{ print "chr"$1,$2,$3,$4,$5,$6,$7,$8 }' > Equus_caballus_structural_variations_fixedChrNames.vcf
sed -i 's/chrMT/chrM/g' Equus_caballus_structural_variations_fixedChrNames.vcf
perl $script_path/sortByRef.pl Equus_caballus_structural_variations_fixedChrNames.vcf $gatk_ref_index > Equus_caballus_structural_variations_fixedChrNames_sorted.vcf
grep "^#" Equus_caballus_structural_variations.vcf > Equus_caballus_structural_variations_final.vcf
cat Equus_caballus_structural_variations_fixedChrNames_sorted.vcf >> Equus_caballus_structural_variations_final.vcf
echo "knownIndels=$genome_dir/knowVar/Equus_caballus_structural_variations_final.vcf" >> $horse_trans/config.txt
grep -v "^#" Equus_caballus.vcf | awk -F "\t" -v OFS='\t' '{ print "chr"$1,$2,$3,$4,$5,$6,$7,$8 }' > Equus_caballus_fixedChrNames.vcf
sed -i 's/chrMT/chrM/g' Equus_caballus_fixedChrNames.vcf
perl $script_path/sortByRef.pl Equus_caballus_fixedChrNames.vcf $gatk_ref_index > Equus_caballus_fixedChrNames_sorted.vcf
grep "^#" Equus_caballus.vcf > Equus_caballus_final.vcf
#cat Equus_caballus_fixedChrNames_sorted.vcf >> Equus_caballus_final.vcf
grep "TSA=SNV" Equus_caballus_fixedChrNames_sorted.vcf >> Equus_caballus_final.vcf
echo "knownSNPs=$genome_dir/knowVar/Equus_caballus_final.vcf" >> $horse_trans/config.txt
source $horse_trans/config.txt
###########################################################################################
### Initiate the basic structure for horse track hubs
echo "UCSCgenome=equCab2" >> $horse_trans/config.txt
echo "chromSizes=$genome_dir/$UCSCgenome.chrom.sizes" >> $horse_trans/config.txt
source $horse_trans/config.txt
## fetch the UCSC database to get the chromosome sizes
bash ${script_path}/calcChromSizes.sh $UCSCgenome $chromSizes
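## calcChromSizes.sh is assumed to wrap the UCSC fetchChromSizes utility, e.g.:
#fetchChromSizes $UCSCgenome > $chromSizes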
## Create the basic directory structure of the track hubs
mkdir -p $track_hub/$UCSCgenome/BigBed
###########################################################################################
## Track for public assemblies
mkdir -p $pubAssemblies/Hestand_2014 && cd $pubAssemblies/Hestand_2014
wget http://server1.intrepidbio.com/FeatureBrowser/gtffilereader/record/-4027466309079733025/7666675698.gtf
mkdir -p $pubAssemblies/NCBI && cd $pubAssemblies/NCBI
cp $ncbiGTF_file ncbiAnn.gtf
mkdir -p $pubAssemblies/ISME.PBMC && cd $pubAssemblies/ISME.PBMC
wget http://europepmc.org/articles/PMC4366165/bin/pone.0122011.s005.zip
unzip *.zip
## create list of public assemblies
rm -f $pubAssemblies/public_assemblies.txt
for tissue in $pubAssemblies/*; do
echo "$pubAssemblies" "${tissue#$pubAssemblies/}" >> $pubAssemblies/public_assemblies.txt;
done
####################
## convert the gtf files into BigBed files & copy the BigBed files to the track hub directory
update=0 ## 0 means do not update BigBed files & 1 means update
rm -f $horse_trans/public_assemblies.txt
while read ass_path assembly; do
echo $assembly
cd $ass_path/$assembly
if [[ ! -f "*.BigBed" || "$update" -eq 1 ]];then
targetAss=$(ls *.gtf)
if [ -f "$targetAss" ];then
bash $script_path/gtfToBigBed.sh "$targetAss" "$genome_dir/$UCSCgenome.chrom.sizes" "$script_path"
else echo "can not find target assembly"; break;fi
if [ -f ${targetAss%.gtf}.BigBed ];then
identifier=$(echo $assembly | sed 's/\//_/g' | sed 's/_output//g')
cp ${targetAss%.gtf}.BigBed $track_hub/$UCSCgenome/BigBed/${identifier}.BigBed
else echo "could not make merged.BigBed file"; break; fi
fi
echo $ass_path/$assembly >> $horse_trans/public_assemblies.txt;
done < $pubAssemblies/public_assemblies.txt
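## a minimal sketch of the kent conversion chain assumed inside gtfToBigBed.sh:
#gtfToGenePred input.gtf input.gpred
#cat input.gpred | $script_path/genePredToBed | sort -k1,1 -k2,2n > input.sorted.bed
#bedToBigBed -type=bed12 input.sorted.bed $genome_dir/$UCSCgenome.chrom.sizes input.BigBed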
## initiate a given track hub
hub_name=$"HorseTrans_public_assemblies"
shortlabel=$"public_assemblies"
longlabel=$"Publically available assemblies"
email=$"[email protected]"
cd $track_hub
bash $script_path/create_trackHub.sh "$UCSCgenome" "$hub_name" "$shortlabel" "$longlabel" "$email"
## edit the trackDb
current_libs=$track_hub/current_libs_$shortlabel
current_tissues=$track_hub/current_tiss_$shortlabel
trackDb=$track_hub/$UCSCgenome/trackDb_$shortlabel.txt
lib_assemblies=$pubAssemblies/public_assemblies.txt
tiss_assemblies=$horse_trans/emptyTemp.txt
bash $script_path/edit_trackDb.sh $current_libs $current_tissues $trackDb $lib_assemblies $tiss_assemblies
###########################################################################################
### Install homology search databases
## download a protein database such as SwissProt (fast) or UniRef90 (slow but more comprehensive)
mkdir -p $genome_dir/ptnDB
cd $genome_dir/ptnDB
wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
gunzip uniprot_sprot.fasta.gz
#wget ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz
#gunzip uniref90.fasta.gz
echo "refPtn=$genome_dir/ptnDB/uniprot_sprot.fasta" >> $horse_trans/config.txt
#echo "refPtn=$genome_dir/ptnDB/uniref90.fasta" >> $horse_trans/config.txt
source $horse_trans/config.txt
bash $script_path/make_ptnDB.sh $refPtn
#bash $script_path/make_ptnDB.sh $refPtn
## download the Pfam database
wget ftp://ftp.broadinstitute.org/pub/Trinity/Trinotate_v2.0_RESOURCES/Pfam-A.hmm.gz
gunzip Pfam-A.hmm.gz
echo "refPfam=$genome_dir/ptnDB/Pfam-A.hmm" >> $horse_trans/config.txt
source $horse_trans/config.txt
bash $script_path/make_PfamDB.sh $refPfam
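## minimal sketches of the database-formatting calls assumed inside make_ptnDB.sh and make_PfamDB.sh:
#makeblastdb -in uniprot_sprot.fasta -dbtype prot  ## BLAST protein database for the homology searches
#hmmpress Pfam-A.hmm                               ## binary HMM database for hmmscan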
###########################################################################################
## build Tophat transcriptome-index
echo "transcriptome_index=$genome_dir/trans_index/equ" >> $horse_trans/config.txt
source $horse_trans/config.txt
bash ${script_path}/run_buildTransIndex.sh "$refGTF_file" "$transcriptome_index" "$Bowtie2_genome_index_base" "$platform"
###########################################################################################
#### Mapping
## This step requires an input working_list & sample_list and an output folder
## the module should produce mapped reads in the form of tophat_<sampleID>/accepted_hits.bam
## The output is a sample list with accepted_hits.bam files
## refGene guided Tophat mapping per sample
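## a minimal sketch of the per-sample call assumed inside run_tophat.sh (thread count and file names are illustrative):
#tophat2 -p 4 --library-type $strand --transcriptome-index=$transcriptome_index \
#        -o tophat_sampleA $Bowtie2_genome_index_base sampleA_R1_001.pe.se.fq sampleA_R2_001.pe.fq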
while read work_dir; do
echo $work_dir
mkdir -p $work_dir/tophat_output
cd $work_dir/tophat_output
lib=$(basename $work_dir | cut -d"_" -f 1) ## PE or SE
strand=$(basename $work_dir | cut -d"_" -f 3 | sed 's/\./-/') ## fr-unstranded, fr-firststrand or fr-secondstrand
sample_list=$work_dir/trimmed_RNA_reads/sample_list.txt
bash ${script_path}/run_tophat.sh "$sample_list" "$lib" "$strand" "$Bowtie2_genome_index_base" "$transcriptome_index" "$script_path" ## Tophat sort by coordinate
done < $horse_trans/working_list.txt
## Check for successful tophat runs and troubleshoot the failed tophat jobs
while read work_dir; do
cd $work_dir/tophat_output
sample_list=$work_dir/trimmed_RNA_reads/sample_list.txt
failedSample_list=$work_dir/trimmed_RNA_reads/tophat_failedSamples.txt
> $failedSample_list ## erase previously failed samples if any
##bash $script_path/check_tophat.sh "$failedSample_list" ## requires tophat-[SP]E.e & .o
bash $script_path/check2_tophat.sh "$sample_list" "$failedSample_list" ## check output log files
x=$(cat $failedSample_list | wc -l)
if [ $x -ne 0 ]; then
lib=$(basename $work_dir | cut -d"_" -f 1)
strand=$(basename $work_dir | cut -d"_" -f 3 | sed 's/\./-/')
echo "Failed tophat jobs in: "$work_dir
bash ${script_path}/run_tophat.sh "$failedSample_list" "$lib" "$strand" "$Bowtie2_genome_index_base" "$transcriptome_index" "$script_path"
fi
done < $horse_trans/working_list.txt
##################
## create summary for tophat run
headers=$(Rscript -e 'cat("Tissue", "Library", "min_mapping", "max_mapping", "min_concordance", "max_concordance", sep="\t");')
echo "$headers" > $horse_trans/tophat_summary.txt
while read work_dir; do
> $work_dir/tophat_output/allsample_summary.txt
> $work_dir/tophat_output/allsample_summary_detailed.txt
for f in $work_dir/tophat_output/tophat_*; do
echo ${f} >> $work_dir/tophat_output/allsample_summary.txt
cd ${f}
grep "overall read mapping rate" align_summary.txt >> ../allsample_summary.txt
grep "concordant pair alignment rate" align_summary.txt >> ../allsample_summary.txt
echo ${f} >> $work_dir/tophat_output/allsample_summary_detailed.txt
cat align_summary.txt >> ../allsample_summary_detailed.txt
done
mapping=$(grep "overall read mapping rate" $work_dir/tophat_output/allsample_summary.txt | awk '{ print $1 }' | sort -n | sed -e 1b -e '$!d' | tr "\n" "\t")
conc=$(grep "concordant pair alignment rate" $work_dir/tophat_output/allsample_summary.txt | awk '{ print $1 }' | sort -n | sed -e 1b -e '$!d' | tr "\n" "\t")
lib=$(basename $work_dir)
tissue=$(dirname $work_dir | xargs basename)
echo "$tissue"$'\t'"$lib"$'\t'"$mapping""$conc" >> $horse_trans/tophat_summary.txt
cat $work_dir/tophat_output/allsample_summary_detailed.txt >> $horse_trans/tophat_summary_detailed.txt
done < $horse_trans/working_list.txt
##################
## define the list of samples.
## This is where you can edit the output list file(s) to restrict the processing to certain target(s)
while read work_dir; do if [ -d $work_dir/tophat_output ]; then
rm -f $work_dir/tophat_output/preMerge_sample_list.txt
for f in $work_dir/tophat_output/tophat_*/accepted_hits.bam; do if [ -f $f ]; then
echo $f >> $work_dir/tophat_output/preMerge_sample_list.txt; fi; done;
fi; done < $horse_trans/working_list.txt
###########################################################################################
## Add Read group headers
while read work_dir; do
echo $work_dir
lib=$(basename $work_dir)
sample_list=$work_dir/tophat_output/preMerge_sample_list.txt
replicates_list=$work_dir/fastq_data/replicates.txt
bash ${script_path}/run_readGroupInfo_illumina.sh "$sample_list" "$replicates_list" "$lib" "${script_path}/readGroupInfo_illumina.sh"
done < $horse_trans/working_list.txt
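## a minimal sketch of the Picard call assumed inside readGroupInfo_illumina.sh (read-group field values are illustrative):
#java -jar picard.jar AddOrReplaceReadGroups I=accepted_hits.bam O=accepted_hits_RG.bam \
#     RGID=lane1 RGLB=$lib RGPL=illumina RGPU=unit1 RGSM=sampleA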
## Check for successful addition of read groups
## To be added
## Tip: the line before last in .e file has "AddOrReplaceReadGroups done"
## for f in $prepData/*/*/tophat_output/tophat_*/AddOrReplaceReadGroups.e*; do grep "AddOrReplaceReadGroups done" $f | wc -l; done
###########################################################################################
## merge replicates
while read work_dir; do
echo $work_dir
cd $work_dir/tophat_output
replicates_list=$work_dir/fastq_data/replicates.txt
if [ -f $replicates_list ]; then
bash ${script_path}/run_mergeBAM.sh "$replicates_list" "${script_path}/mergeBAM.sh" ## picard tools sort by coordinate
else echo "No replicates file in" $work_dir;
fi; done < $horse_trans/working_list.txt
## check for successful merging
## To be added
##################
## define the list of samples.
## This is where you can edit the output list file(s) to restrict the processing to certain target(s)
while read work_dir; do if [ -d $work_dir/tophat_output ]; then
rm -f $work_dir/tophat_output/sample_list.txt
for f in $work_dir/tophat_output/tophat_*; do if [ -d $f ]; then
echo $f >> $work_dir/tophat_output/sample_list.txt; fi; done;
fi; done < $horse_trans/working_list.txt
###########################################################################################
#### pipeline_OneSampleAtaTime_Tophat2.Cufflinks.Cuffmerge
### Run Cufflinks: output transcripts.gtf in the same tophat_sample folder
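## a minimal sketch of the per-sample call assumed inside cufflinks_noRef.sh (thread count is illustrative):
#cufflinks -p 4 -o tophat_sampleA tophat_sampleA/accepted_hits.bam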
while read work_dir; do
echo $work_dir
cd $work_dir/tophat_output
sample_list=$work_dir/tophat_output/sample_list.txt
#bash ${script_path}/run_cufflinks_wRef.sh "$sample_list" "$refGTF_file" "$script_path/cufflinks.sh";
bash ${script_path}/run_cufflinks_noRef.sh "$sample_list" "$script_path/cufflinks_noRef.sh";
done < $horse_trans/working_list.txt
## Check for successful Cufflinks runs and troubleshoot the failed Cufflinks jobs (requires cufflinks.e)
while read work_dir; do
cd $work_dir/tophat_output
failedSample_list=$work_dir/tophat_output/failedSamples.txt ## define the path of an empty file
bash $script_path/check_cufflinks.sh "$failedSample_list"
x=$(cat $failedSample_list | wc -l)
if [ $x -ne 0 ]; then
echo "Failed Cufflinks jobs in: "$work_dir
cat $failedSample_list
#bash ${script_path}/run_cufflinks_wRef.sh "$failedSample_list" "$refGTF_file" "$script_path/cufflinks.sh";
bash ${script_path}/run_cufflinks_noRef.sh "$failedSample_list" "$script_path/cufflinks_noRef2.sh"
fi; done < $horse_trans/working_list.txt
## downsample the failed samples to peak coverage 200x
## convert the tophat output BAM files into Fastq files
while read work_dir; do
cd $work_dir/tophat_output
failedSample_list=$work_dir/tophat_output/failedSamples.txt ## define the path of an empty file
bash $script_path/check_cufflinks.sh "$failedSample_list"
x=$(cat $failedSample_list | wc -l)
if [ $x -ne 0 ]; then
echo "Failed Cufflinks jobs in: "$work_dir
cat $failedSample_list
lib=$(basename $work_dir | cut -d"_" -f 1)
bash ${script_path}/run_BamToFastq.sh "$failedSample_list" "$lib" "-" "$script_path/restore_mapped_trimmed.sh"
fi; done < $horse_trans/working_list.txt
## run digital normalization of lab-specific tissues (needs to be updated to use a sample list and check for success)
kmer=20
cutoff=200
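## a minimal sketch of the khmer calls assumed inside run_diginorm.sh (normalize-by-median.py is from the
## khmer package; memory settings are illustrative, and the .keep outputs feed the steps below):
#normalize-by-median.py -p -k $kmer -C $cutoff -N 4 -x 2e9 --savetable normC${cutoff}k${kmer}.kt *.pe.fq
#normalize-by-median.py -C $cutoff --loadtable normC${cutoff}k${kmer}.kt --savetable normC${cutoff}k${kmer}.kt *.se.fq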
while read work_dir; do
lib=$(basename $work_dir | cut -d"_" -f 1)
while read data_dir; do if [ -d $data_dir ];then
cd $data_dir
bash ${script_path}/run_diginorm.sh "$lib" "$data_dir" "$kmer" "$cutoff" "$script_path"
fi; done < $work_dir/tophat_output/failedSamples.txt
done < $horse_trans/working_list.txt
## split the interleaved reads
while read work_dir; do
sample_list=$work_dir/tophat_output/failedSamples.txt
x=$(cat $sample_list | wc -l)
lib=$(basename $work_dir | cut -d"_" -f 1) ## PE or SE
if [ $x -ne 0 ] && [ "$lib" = $"PE" ]; then
echo $work_dir
bash ${script_path}/run_split_reads.sh "$sample_list" $script_path/split_reads.sh
fi; done < $horse_trans/working_list.txt
## merge singletons and change the file names to fit the tophat script
while read work_dir; do
sample_list=$work_dir/tophat_output/failedSamples.txt
x=$(cat $sample_list | wc -l)
lib=$(basename $work_dir | cut -d"_" -f 1) ## PE or SE
if [ $x -ne 0 ] && [ "$lib" = $"PE" ]; then
echo $work_dir
f=$(ls *_R1_001.pe.fq)
base=${f%_R1_001.pe.fq}
cat $f allsingletons.fq.keep > "$base"_R1_001.pe.se.fq;
elif [ $x -ne 0 ] && [ "$lib" = $"SE" ]; then
mv allsingletons.fq.keep allsingletons_SR_001.se.fq
fi; done < $horse_trans/working_list.txt
## run Tophat on each sample
while read work_dir; do
x=$(cat $work_dir/tophat_output/failedSamples.txt | wc -l)
if [ $x -ne 0 ];then
while read data_dir; do if [ -d $data_dir ];then
echo $data_dir/*_R1_001.pe.se.fq
fi; done < $work_dir/tophat_output/failedSamples.txt > $work_dir/tophat_output/failedSamples_forTophat.txt
cat $work_dir/tophat_output/failedSamples_forTophat.txt
mkdir -p $work_dir/tophat_output/digi_tophat_output
cd $work_dir/tophat_output/digi_tophat_output
lib=$(basename $work_dir | cut -d"_" -f 1) ## PE or SE
strand=$(basename $work_dir | cut -d"_" -f 3 | sed 's/\./-/') ## fr-unstranded, fr-firststrand or fr-secondstrand
sample_list=$work_dir/tophat_output/failedSamples_forTophat.txt
bash ${script_path}/run_tophat.sh "$sample_list" "$lib" "$strand" "$Bowtie2_genome_index_base" "$transcriptome_index" "$script_path"
fi; done < $horse_trans/working_list.txt
## define the list of samples.
while read work_dir; do if [ -d $work_dir/tophat_output/digi_tophat_output ]; then
rm -f $work_dir/tophat_output/digi_tophat_output/sample_list.txt
for f in $work_dir/tophat_output/digi_tophat_output/tophat_*; do if [ -d $f ]; then
echo $f >> $work_dir/tophat_output/digi_tophat_output/sample_list.txt; fi; done;
fi; done < $horse_trans/working_list.txt
### Run Cufflinks: output transcripts.gtf in the same tophat_sample folder
while read work_dir; do
echo $work_dir
cd $work_dir/tophat_output/digi_tophat_output
sample_list=$work_dir/tophat_output/digi_tophat_output/sample_list.txt
#bash ${script_path}/run_cufflinks_wRef.sh "$sample_list" "$refGTF_file" "$script_path/cufflinks.sh";
bash ${script_path}/run_cufflinks_noRef.sh "$sample_list" "$script_path/cufflinks_noRef_mask.sh";
done < $horse_trans/working_list.txt
## relocate the cufflinks analysis results
while read work_dir; do
while read sample_dir;do if [ -d $sample_dir ];then
echo $sample_dir
target_dir=$work_dir/tophat_output/$(basename $sample_dir)
cp $sample_dir/{transcripts.gtf,skipped.gtf,*.fpkm_tracking,cufflinks.[oe]*} $target_dir/.
fi; done < $work_dir/tophat_output/digi_tophat_output/sample_list.txt
done < $horse_trans/working_list.txt
############
## Assess computational utilization of cufflinks
cufflinks_utlization=$prepData/${cufflinks_run}_cufflinks_utlization.txt
> $cufflinks_utlization
while read work_dir; do
cd $work_dir/tophat_output
sample_list=$work_dir/tophat_output/sample_list.txt
bash ${script_path}/assess_cufflinks_utlization.sh "$sample_list" "$cufflinks_utlization"
done < $horse_trans/working_list.txt
############
## relocate the cufflinks analysis results
while read work_dir; do if [ -d $work_dir/tophat_output ]; then
cd $work_dir/tophat_output
for dir in $work_dir/tophat_output/tophat_*; do if [ -f "$dir"/transcripts.gtf ]; then
cd $dir
mkdir $cufflinks_run && \
mv "$dir"/{transcripts.gtf,skipped.gtf,*.fpkm_tracking,cufflinks.[oe]*} $cufflinks_run/.; fi; done
fi; done < $horse_trans/working_list.txt
########################
### Run cuffmerge: merge the sample assemblies and output merged.gtf in tophat_output/$cufflinks_run/$cuffmerge_run
cuffmerge_output=$cufflinks_run/$cuffmerge_run
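## a minimal sketch of the call assumed inside cuffmerge_noGTF.sh:
#cuffmerge -o $cuffmerge_output -s $genome --min-isoform-fraction $isoformfrac ${cufflinks_run}_assemblies.txt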
while read work_dir; do if [ -d $work_dir/tophat_output ]; then
cd $work_dir/tophat_output
for dir in $work_dir/tophat_output/tophat_*; do if [ -f "$dir"/$cufflinks_run/transcripts.gtf ]; then
echo "$dir"/$cufflinks_run/transcripts.gtf; fi; done > ${cufflinks_run}_assemblies.txt
rm -fR $cuffmerge_output
#bash ${script_path}/cuffmerge_withRefGene.sh "$genome" "$cuffmerge_output" "${cufflinks_run}_assemblies.txt" "$refGTF_file"
isoformfrac=0.05 ## allowed range 0.05-1; default=0.05
bash ${script_path}/cuffmerge_noGTF.sh "$genome" "$cuffmerge_output" "$isoformfrac" "${cufflinks_run}_assemblies.txt"
fi; done < $horse_trans/working_list.txt
## http://cole-trapnell-lab.github.io/cufflinks/cuffcompare/#transfrag-class-codes
##################
### cuffmerge the lab-specific tissue assemblies into tissue specific assemblies
# prepare list of target tissues
rm -f $horse_trans/multi_lib_tissues.txt
for tissue_dir in $prepData/*; do if [[ -d $tissue_dir && $(ls $tissue_dir | wc -l) -gt 1 ]]; then
echo $tissue_dir >> $horse_trans/multi_lib_tissues.txt; fi; done;
# For every target tissue, cuffmerge the lab-specific assemblies into one assembly in $tissue_merge/cuffmerge/TISSUE.NAME/withORwithoutRefGuidence.gtf
tissue_Cuffmerge=$tissue_merge/cuffmerge
mkdir -p $tissue_Cuffmerge
cd $tissue_Cuffmerge
while read tissue_dir; do
tissue=$(basename $tissue_dir)
mkdir -p $tissue
rm -f $tissue_dir/${cufflinks_run}_assemblies.txt
for dir in $tissue_dir/*; do if [ -d "$dir" ]; then
cat "$dir"/tophat_output/${cufflinks_run}_assemblies.txt >> $tissue_dir/${cufflinks_run}_assemblies.txt; fi; done
cuffmerge_output=$tissue/$cufflinks_run/$cuffmerge_run
rm -fR $cuffmerge_output
#bash ${script_path}/cuffmerge_withRefGene.sh "$genome" "$cuffmerge_output" "$tissue_dir/${cufflinks_run}_assemblies.txt" "$refGTF_file"
isoformfrac=0.05 ## allowed range 0.05-1; default=0.05
bash ${script_path}/cuffmerge_noGTF.sh "$genome" "$cuffmerge_output" "$isoformfrac" "$tissue_dir/${cufflinks_run}_assemblies.txt"
done < $horse_trans/multi_lib_tissues.txt
##################
### cuffmerge all assemblies into one total assembly
# save the assembly in $horse_trans/total_merge/cuffmerge/withORwithoutRefGuidence.gtf
cd $tissue_Cuffmerge
rm -f $prepData/${cufflinks_run}_assemblies.txt
while read work_dir; do if [ -d $work_dir/tophat_output ]; then
cat "$work_dir"/tophat_output/${cufflinks_run}_assemblies.txt >> $prepData/${cufflinks_run}_assemblies.txt
fi; done < $horse_trans/working_list.txt
dist_dir="all_tissues_frac$isoformfrac"
mkdir -p $dist_dir
cuffmerge_output=$dist_dir/$cufflinks_run/$cuffmerge_run
#bash ${script_path}/cuffmerge_withRefGene.sh "$genome" "$cuffmerge_output" "$prepData/${cufflinks_run}_assemblies.txt" "$refGTF_file"
bash ${script_path}/cuffmerge_noGTF.sh "$genome" "$cuffmerge_output" "$isoformfrac" "$prepData/${cufflinks_run}_assemblies.txt"
## copy the assembly to the download folder
cp $tissue_Cuffmerge/$cuffmerge_output/merged.gtf $horse_trans/downloads/allTissues_assemblies_and_stats/unfiltered_Alltissues_Assembly.GTF
#Construct the transcript fasta file && calculate statistics
cd $tissue_Cuffmerge/$cuffmerge_output
bash $script_path/run_genome_to_cdna_fasta.sh "merged.gtf" "$genome" "transcripts.fasta" "$script_path/genome_to_cdna_fasta.sh"
bash $script_path/run_seq_stats.sh "transcripts.fasta" "unfiltered_transcriptome.MatzStat"
cp unfiltered_transcriptome.MatzStat $horse_trans/downloads/allTissues_assemblies_and_stats/.
###################
## create a hub for non filtered data
## create list of assemblies from each library
## This is where you can edit the list to restrict the processing to certain target(s)
rm -f $prepData/${cufflinks_run}_${cuffmerge_run}_merged_raw.assemblies.txt
while read work_dir; do
dir=$work_dir/tophat_output/$cufflinks_run/$cuffmerge_run
if [ -d $dir ]; then
echo "$prepData" "${dir#$prepData/}" >> $prepData/${cufflinks_run}_${cuffmerge_run}_merged_raw.assemblies.txt;
fi; done < $horse_trans/working_list.txt
## create list of assemblies for tissues of multiple libraries
rm -f $tissue_Cuffmerge/${cufflinks_run}_${cuffmerge_run}_tissue_raw.assemblies.txt
for tissue in $tissue_Cuffmerge/*/$cufflinks_run/$cuffmerge_run; do if [ -d $tissue ]; then
echo "$tissue_Cuffmerge" "${tissue#$tissue_Cuffmerge/}" >> $tissue_Cuffmerge/${cufflinks_run}_${cuffmerge_run}_tissue_raw.assemblies.txt;
fi; done
####################
## convert the gtf files into BigBed files & copy the BigBed files to the track hub directory
update=1 ## 0 means do not update BigBed files & 1 means update
rm -f $horse_trans/Tophat_${cufflinks_run}_${cuffmerge_run}_raw.assemblies.txt
while read ass_path assembly; do
echo $assembly
if [ -d "$ass_path/$assembly" ];then
cd $ass_path/$assembly
else echo "can not find $ass_path/$assembly"; break;fi
if [[ ! -f "merged.BigBed" || "$update" -eq 1 ]];then
targetAss=$"merged.gtf"
if [ -f "$targetAss" ];then
bash $script_path/gtfToBigBed.sh "$targetAss" "$genome_dir/$UCSCgenome.chrom.sizes" "$script_path"
else echo "can not find merged.gtf"; break;fi
if [ -f $"merged.BigBed" ];then
identifier=$(echo $assembly | sed 's/\//_/g' | sed 's/_output//g')
cp merged.BigBed $track_hub/$UCSCgenome/BigBed/${identifier}.BigBed
else echo "could not make merged.BigBed file"; break; fi
fi
echo $ass_path/$assembly >> $horse_trans/Tophat_${cufflinks_run}_${cuffmerge_run}_raw.assemblies.txt;
done < <(cat $prepData/${cufflinks_run}_${cuffmerge_run}_merged_raw.assemblies.txt \
$tissue_Cuffmerge/${cufflinks_run}_${cuffmerge_run}_tissue_raw.assemblies.txt)
## initiate a given track hub for cufflinks_run="nonGuided_Cufflinks"
hub_name=$"HorseTrans_TNGCuffUnfilt"
shortlabel=$"TNGCuffUnfilt"
longlabel=$"Single samlpe refGuided Tophat - nonGuided Cufflinks/Cuffmerge - Unfiltered"
email=$"[email protected]"
cd $track_hub
bash $script_path/create_trackHub.sh "$UCSCgenome" "$hub_name" "$shortlabel" "$longlabel" "$email"
## edit the trackDb
current_libs=$track_hub/current_libs_$shortlabel
current_tissues=$track_hub/current_tiss_$shortlabel
trackDb=$track_hub/$UCSCgenome/trackDb_$shortlabel.txt
lib_assemblies=$prepData/${cufflinks_run}_${cuffmerge_run}_merged_raw.assemblies.txt
tiss_assemblies=$tissue_Cuffmerge/${cufflinks_run}_${cuffmerge_run}_tissue_raw.assemblies.txt
bash $script_path/edit_trackDb.sh $current_libs $current_tissues $trackDb $lib_assemblies $tiss_assemblies
##########################
## filter out single-exon transfrags completely contained within introns of other multi-exon transcripts (class codes i, e, o): 37464 transcripts
cd $tissue_Cuffmerge/$cuffmerge_output
bash $script_path/gtfToBed.sh "merged.gtf" "$script_path"
cat merged.bed | awk '{if($10>1) print $4}' > multiExon_id
mkdir -p filtered/NoIntronicFrag/prep
grep -F -w -f multiExon_id merged.gtf > filtered/NoIntronicFrag/prep/multiExon.gtf
cd filtered/NoIntronicFrag/prep
assembly="$tissue_Cuffmerge/$cuffmerge_output/merged.gtf"
identifier="nonGuided_Cufflinks_multiExon"
bash ${script_path}/run_cuffcompare.sh "$assembly" "$identifier" "multiExon.gtf" "$script_path/cuffcompare.sh"
mv $(dirname $assembly)/{$identifier.*.refmap,$identifier.*.tmap} .
tail -n+2 nonGuided_Cufflinks_multiExon.merged.gtf.tmap | awk '{if($3!="e" && $3!="i" && $3!="o") print $5}' > keepit.id
grep -F -w -f keepit.id $assembly > ../merged.gtf
## copy the filtered assembly to the download folder
cp $tissue_Cuffmerge/$cuffmerge_output/filtered/NoIntronicFrag/merged.gtf $horse_trans/downloads/allTissues_assemblies_and_stats/filtered1_NoIntronicFrag_Alltissues.GTF
#Construct the transcript fasta file && calculate statistics
cd $tissue_Cuffmerge/$cuffmerge_output/filtered/NoIntronicFrag
bash $script_path/run_genome_to_cdna_fasta.sh "merged.gtf" "$genome" "transcripts.fasta" "$script_path/genome_to_cdna_fasta.sh"
bash $script_path/run_seq_stats.sh "transcripts.fasta" "filtered1_NoIntronicFrag_transcriptome.MatzStat"
cp filtered1_NoIntronicFrag_transcriptome.MatzStat $horse_trans/downloads/allTissues_assemblies_and_stats/.
##########################
## Using Salmon to eliminate low-expressed transcripts
## exclude isoforms with TPM less than 5% of the total TPM of each locus: 41543 transcripts
mkdir -p $tissue_Cuffmerge/$cuffmerge_output/filtered/highExp/prep
cd $tissue_Cuffmerge/$cuffmerge_output/filtered/highExp/prep
assembly="$tissue_Cuffmerge/$cuffmerge_output/filtered/NoIntronicFrag/merged.gtf"
bash $script_path/run_genome_to_cdna_fasta.sh "$assembly" "$genome" "transcripts.fa" "$script_path/genome_to_cdna_fasta.sh"
bash $script_path/run_salmonIndex.sh "horse_index" "transcripts.fa" ${script_path}/salmonIndex.sh
#qsub -v index="horse_index",transcriptome="transcripts.fa" ${script_path}/salmonIndex.sh
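## minimal sketches of the salmon calls assumed inside salmonIndex.sh and run_salmon.sh
## (library-type code and thread count are illustrative):
#salmon index -t transcripts.fa -i horse_index
#salmon quant -i horse_index -l IU -1 sampleA_R1_001.pe.fq -2 sampleA_R2_001.pe.fq -p 4 -o $identifier.quant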
while read work_dir; do
lib=$(basename $work_dir | cut -d"_" -f 1) ## PE or SE
strand=$(basename $work_dir | cut -d"_" -f 3 | sed 's/\./-/') ## fr-unstranded, fr-firststrand or fr-secondstrand
identifier=$(echo $work_dir | rev | cut -d"/" -f 1,2 | rev | sed 's/\//_/')
seq_dir=$work_dir/trimmed_RNA_reads
bash ${script_path}/run_salmon.sh "$lib" "$strand" "horse_index" "$identifier" "$seq_dir" "$script_path"
done < $horse_trans/working_list.txt
find ./*.quant -name \*.sf -exec grep -H "mapping rate" {} \; | sort > salmonQuant_summary.txt
python $script_path/gather-counts.py -i "$(pwd)"
echo "transcript"$'\t'"length" > transcripts.lengthes
sf=$(find ./*.quant -name \*.sf | head -n1)
cat $sf | grep -v "^#" | awk -F "\t" -v OFS='\t' '{print $1,$2}' >> transcripts.lengthes
#grep "^>" transcripts.fa | sed 's/>//g' > transTogene.map
#module load R/3.0.1
#Rscript ${script_path}/calcTPM.R "$(pwd)"
grep "^>" transcripts.fa | awk -F'[> ]' '{print $3,$2}' > gene_transcript_map
identifier="" ## leave the identifier empty if you want to calculate TPM for all files
bash $script_path/run_calcTPM.sh "$(pwd)" "$identifier" "transcripts.lengthes" "gene_transcript_map" ${script_path}/calcTPM2.R
#Rscript ${script_path}/calcTPM.R "$(pwd)"
cat dataSummary_comp | tail -n+2 | awk '{if($10 >= 5)print $3}' > keepit.id
grep -F -w -f keepit.id $assembly > ../merged.gtf
## copy the filtered assembly to the download folder
cp $tissue_Cuffmerge/$cuffmerge_output/filtered/highExp/merged.gtf $horse_trans/downloads/allTissues_assemblies_and_stats/filtered2_hiExp_Alltissues.GTF
#Construct the transcript fasta file && calculate statistics
cd $tissue_Cuffmerge/$cuffmerge_output/filtered/highExp
bash $script_path/run_genome_to_cdna_fasta.sh "merged.gtf" "$genome" "transcripts.fasta" "$script_path/genome_to_cdna_fasta.sh"
bash $script_path/run_seq_stats.sh "transcripts.fasta" "filtered2_hiExp_transcriptome.MatzStat"
cp filtered2_hiExp_transcriptome.MatzStat $horse_trans/downloads/allTissues_assemblies_and_stats/.
###################
## define gene model supporting evidence
filtered2_hiExpGTF=$tissue_Cuffmerge/$cuffmerge_output/filtered/highExp/merged.gtf
## Run Cuffcompare with public annotations
cuffcompare=$tissue_Cuffmerge/$cuffmerge_output/filtered/supported/cuffcompare
mkdir -p $cuffcompare
cd $cuffcompare
while read root_dir asm_dir; do echo $asm_dir $root_dir/$asm_dir/*.gtf;done < $pubAssemblies/public_assemblies.txt > assmblies.txt
echo ensGTF_file ${ensGTF_file} >> assmblies.txt
echo refGTF_file ${refGTF_file} >> assmblies.txt
echo "filtered2_hiExp" $filtered2_hiExpGTF >> assmblies.txt
while read ref_name ref_assembly;do
while read asm_name assembly;do if [ "$assembly" != "$ref_assembly" ];then
mkdir -p $cuffcompare/$asm_name.vs.$ref_name
cd $cuffcompare/$asm_name.vs.$ref_name
identifier=$asm_name.vs.$ref_name
echo $identifier
#echo "$assembly" "$identifier" "${ref_assembly}" >> $horse_trans/cuffcompare/temp
bash ${script_path}/run_cuffcompare.sh "$assembly" "$identifier" "${ref_assembly}" "$script_path/cuffcompare.sh"
fi; done < $cuffcompare/assmblies.txt
done < $cuffcompare/assmblies.txt
## Transcriptome annotation table: merge all the tmap files for each annotation as a query against all other annotations as references
cd $cuffcompare
while read asm_name assembly;do
## make a transcript to gene map
echo -e "transcript.ID\tgene.ID\tchr\tstrand" > $asm_name.transTogene
cat $assembly | awk -F '[\t"]' 'BEGIN{OFS="\t";} {print $12,$10,$1,$7}' | uniq >> $asm_name.transTogene
cp $asm_name.transTogene $asm_name.merge
while read ref_name ref_assembly;do if [ "$assembly" != "$ref_assembly" ];then
identifier=$asm_name.vs.$ref_name
echo $identifier
mv $(dirname $assembly)/{$identifier.*.refmap,$identifier.*.tmap} $cuffcompare/$identifier/.
Rscript -e 'args=(commandArgs(TRUE)); data1=read.table(args[1],sep="\t",header=T,row.names=NULL);'\
'data2=read.table(args[2], header=T,row.names=NULL,sep="\t");'\
'colnames(data2)[-c(5,11,12)]=as.vector(sapply(args[3], paste0, colnames(data2)[-c(5,11,12)]));'\
'dataMerge=merge(data1,data2[,c(5,11,12,2,1,13,3)],by.x="transcript.ID",by.y="cuff_id",all.x=T, all.y=T);'\
'write.table(dataMerge,args[1], sep="\t", quote=F, row.names=F, col.names=T);' $asm_name.merge $cuffcompare/$identifier/$identifier.*.tmap $ref_name.
fi; done < $cuffcompare/assmblies.txt
cat $asm_name.merge | cut -f 1-10,13-16,19-22,25-28,31-34 > $asm_name.merge.reduced
done < $cuffcompare/assmblies.txt
#########
## compare the filtered assembly to non-horse RefGene transcripts & gene prediction models
consAna=$tissue_Cuffmerge/$cuffmerge_output/filtered/supported/consAna
mkdir $consAna
cd $consAna
wget http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/xenoRefGene.txt.gz
zcat xenoRefGene.txt.gz | cut -f2-16 | $script_path/UCSC_kent_commands/genePredToGtf file stdin xenoRefGene.gtf
wget http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/transMapAlnRefSeq.txt.gz
zcat transMapAlnRefSeq.txt.gz | cut -f2-22 > transMapAlnRefSeq.psl
$script_path/UCSC_kent_commands/pslToBed transMapAlnRefSeq.psl transMapAlnRefSeq.bed
$script_path/UCSC_kent_commands/bedToGenePred transMapAlnRefSeq.bed transMapAlnRefSeq.GenePred
$script_path/UCSC_kent_commands/genePredToGtf file transMapAlnRefSeq.GenePred transMapAlnRefSeq.gtf
rm transMapAlnRefSeq.psl transMapAlnRefSeq.bed transMapAlnRefSeq.GenePred
wget http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/augustusGene.txt.gz
zcat augustusGene.txt.gz | cut -f2-16 | $script_path/UCSC_kent_commands/genePredToGtf file stdin augustusGene.gtf
wget http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/geneid.txt.gz
zcat geneid.txt.gz | cut -f2-16 | $script_path/UCSC_kent_commands/genePredToGtf file stdin geneid.gtf
wget http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/genscan.txt.gz
zcat genscan.txt.gz | cut -f2-16 | $script_path/UCSC_kent_commands/genePredToGtf file stdin genscan.gtf
wget http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/nscanGene.txt.gz
zcat nscanGene.txt.gz | cut -f2-16 | $script_path/UCSC_kent_commands/genePredToGtf file stdin nscanGene.gtf
rm -f pred_assemblies.txt
for asm in *.gtf; do
echo "${asm%.gtf}" "$(pwd)/${asm}" >> pred_assemblies.txt;
done
asm_name=filtered2_hiExp
assembly=$filtered2_hiExpGTF
while read ref_name ref_assembly;do
mkdir -p $consAna/$asm_name.vs.$ref_name
cd $consAna/$asm_name.vs.$ref_name
identifier=$asm_name.vs.$ref_name
echo $identifier
#echo "$assembly" "$identifier" "${ref_assembly}" >> $horse_trans/cuffcompare/temp
bash ${script_path}/run_cuffcompare.sh "$assembly" "$identifier" "${ref_assembly}" "$script_path/cuffcompare.sh"
done < $consAna/pred_assemblies.txt
## merge all the tmap files for each annotation as a query against all other annotations as references
cd $consAna
## make a transcript to gene map
echo -e "transcript.ID\tgene.ID\tchr\tstrand" > $consAna/$asm_name.cons.merge
cat $assembly | awk -F '[\t"]' 'BEGIN{OFS="\t";} {print $12,$10,$1,$7}' | uniq >> $consAna/$asm_name.cons.merge
while read ref_name ref_assembly;do
identifier=$asm_name.vs.$ref_name
echo $identifier
mv $(dirname $assembly)/{$identifier.*.refmap,$identifier.*.tmap} $consAna/$identifier/.
Rscript -e 'args=(commandArgs(TRUE)); data1=read.table(args[1],sep="\t",header=T,row.names=NULL);'\
'data2=read.table(args[2], header=T,row.names=NULL,sep="\t");'\
'colnames(data2)[-c(5,11,12)]=as.vector(sapply(args[3], paste0, colnames(data2)[-c(5,11,12)]));'\
'dataMerge=merge(data1,data2[,c(5,11,12,2,1,13,3)],by.x="transcript.ID",by.y="cuff_id",all.x=T, all.y=T);'\
'write.table(dataMerge,args[1], sep="\t", quote=F, row.names=F, col.names=T);' $asm_name.cons.merge $consAna/$identifier/$identifier.*.tmap "$ref_name."
done < $consAna/pred_assemblies.txt
cat $consAna/$asm_name.cons.merge | cut -f 1-10,13-16,19-22,25-28,31-34,37-40 > $consAna/$asm_name.cons.merge.reduced
#########
## run Transdecoder to predict ORFs with homology options
#bash $script_path/run_transdecoder.sh <(echo "$tissue_Cuffmerge/$cuffmerge_output/filtered") $genome $refPtn $refPfam $script_path/transdecoder.sh
transdecoder="$tissue_Cuffmerge/$cuffmerge_output/filtered/supported/transdecoder"
mkdir $transdecoder
cd $transdecoder
#Construct the transcript fasta file
bash $script_path/run_genome_to_cdna_fasta.sh "$filtered2_hiExpGTF" "$genome" "transcripts.fasta" "$script_path/genome_to_cdna_fasta.sh"
# extract the long open reading frames
bash $script_path/run_getLongORFs.sh "transcripts.fasta" "$script_path/getLongORFs.sh"
## identify ORFs with homology to known proteins via blast and pfam searches.
mkdir tempPep && cd tempPep
cp ../transcripts.fasta.transdecoder_dir/longest_orfs.pep .
$script_path/splitFasta.pl longest_orfs.pep 50
for pep in subset[1-5][0-9]_longest_orfs.pep;do
label=${pep%_longest_orfs.pep}
bash $script_path/run_blast.sh "$pep" "$refPtn" $label."blastp.outfmt6" "$script_path/blastp.sh"
bash $script_path/run_hmmscan.sh "$pep" "$refPfam" $label."pfam.domtblout" "$script_path/hmmscan.sh"
done
cat subset*.blastp.outfmt6 >> ../blastp.outfmt6
cat subset*.pfam.domtblout >> ../pfam.domtblout
## predict the likely coding regions
cd ../
bash $script_path/run_transdecoderPredict.sh "transcripts.fasta" "pfam.domtblout" "blastp.outfmt6" "$script_path/transdecoderPredict.sh"
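## minimal sketches of the underlying calls assumed inside the wrapper scripts above
## (E-value and CPU counts are illustrative):
#TransDecoder.LongOrfs -t transcripts.fasta
#blastp -query longest_orfs.pep -db uniprot_sprot.fasta -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > blastp.outfmt6
#hmmscan --cpu 4 --domtblout pfam.domtblout Pfam-A.hmm longest_orfs.pep
#TransDecoder.Predict -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6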
# convert the transcript structure GTF file to an alignment-GFF3 formatted file
$script_path/decoderUtil/cufflinks_gtf_to_alignment_gff3.pl "$filtered2_hiExpGTF" > transcripts.gff3
# generate a genome-based coding region annotation file
$script_path/decoderUtil/cdna_alignment_orf_to_genome_orf.pl transcripts.fasta.transdecoder.gff3 transcripts.gff3 transcripts.fasta 1> transcripts.fasta.transdecoder.genome.gff3 2> sterr
grep "Warning" sterr > warnings.txt && rm sterr
# convert the genome-based gene-gff3 file to bed
$script_path/decoderUtil/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed
tail -n+2 transcripts.fasta.transdecoder.bed | awk -F '\t' '{print $1}' > Trans_ID
tail -n+2 transcripts.fasta.transdecoder.bed | awk -F '[\t:]' '{print $6}' | awk -F '_' '{print $1}' > ORF_len
paste Trans_ID ORF_len > all_ORFs
sort -k1,1 -k2,2rg all_ORFs | sort -u -k1,1 --merge > longest_ORFs
wc -l longest_ORFs # 65211 ## the number of transcripts with likely coding sequences
wc -l all_ORFs # 150528 ## all significant ORFs
##########
## Exclusion of non-coding intergenic transcripts
## exclude transcripts without an ORF unless they match or have an exonic overlap with another assembly
cd $tissue_Cuffmerge/$cuffmerge_output/filtered/supported
Rscript -e 'args=(commandArgs(TRUE)); data1=read.table(args[1],header=T,sep="\t");'\
'data2=read.table(args[2],header=T,sep="\t");'\
'data3=read.table(args[3],header=F);'\
'check=c("=","j","o","x","c");'\
'unsup1=subset(data1,!(NCBI.class_code %in% check | ensGTF_file.class_code %in% check));'\
'unsup2=subset(data2,!(augustusGene.class_code %in% check | geneid.class_code %in% check | genscan.class_code %in% check | nscanGene.class_code %in% check | transMapAlnRefSeq.class_code %in% check | xenoRefGene.class_code %in% check));'\
'unsup_both=intersect(unsup1$transcript.ID,unsup2$transcript.ID);'\
'unsup_all=unsup_both[!(unsup_both %in% data3$V1)];'\
'write.table(unsup_all,"unsup", quote=F, row.names=F, col.names=F);' $cuffcompare/filtered2_hiExp.merge.reduced $consAna/filtered2_hiExp.cons.merge.reduced $transdecoder/Trans_ID
grep -v -F -w -f unsup $filtered2_hiExpGTF > merged.gtf ## 38507
## copy the filtered assembly to the download folder
cp merged.gtf $horse_trans/downloads/allTissues_assemblies_and_stats/filtered3_sup_Alltissues.GTF
#Construct the transcript fasta file && calculate statistics
bash $script_path/run_genome_to_cdna_fasta.sh "merged.gtf" "$genome" "transcripts.fasta" "$script_path/genome_to_cdna_fasta.sh"
bash $script_path/run_seq_stats.sh "transcripts.fasta" "filtered3_sup_transcriptome.MatzStat"
cp filtered3_sup_transcriptome.MatzStat $horse_trans/downloads/allTissues_assemblies_and_stats/.
###################
## Removal of likely erroneous transcripts
filtered3_supGTF=$tissue_Cuffmerge/$cuffmerge_output/filtered/supported/merged.gtf
refined="$tissue_Cuffmerge/$cuffmerge_output/filtered/refined"
mkdir -p $refined
cd $refined
## exclusion of Mitochondrial contigs
cat $filtered3_supGTF | awk '$1 != "chrM"' > noChrM.gtf ## removed 6 transcripts in 2 genes
bash $script_path/gtfToBed.sh "noChrM.gtf" "$script_path"
## exclusion of small transfrags
awk -F'\t' '{print $4","$11}' noChrM.bed | awk -F',' '{for(i=2;i<=NF;i++) t+=$i; if (t < 201)print $1; t=0}' > small_IDs ## 192 transcripts in 184 genes
grep -v -F -w -f small_IDs noChrM.gtf > merged.gtf
## exclusion of transcripts with undefined strand
#awk -F'\t' '{if($6 == ".")print $4;}' noChrM.bed > unstranded_IDs ## additional 6618 transcripts in 6616 genes (58 transcripts shared with small IDs)
#grep -v -F -w -f unstranded_IDs noSmall.gtf > merged.gtf
## copy the filtered assembly to the download folder
cp merged.gtf $horse_trans/downloads/allTissues_assemblies_and_stats/filtered4_refined_Alltissues.GTF
#Construct the transcript fasta file && calculate statistics
bash $script_path/run_genome_to_cdna_fasta.sh "merged.gtf" "$genome" "transcripts.fasta" "$script_path/genome_to_cdna_fasta.sh"
bash $script_path/run_seq_stats.sh "transcripts.fasta" "filtered4_refined_Alltissues.MatzStat"
cp filtered4_refined_Alltissues.MatzStat $horse_trans/downloads/allTissues_assemblies_and_stats/.
###################
## proposed additional filter
## run the complex loci detection module against the NCBI, Ensembl and non-horse tracks
## for each locus, extend the margins to the maximum overlapping loci and exclude them from the refined transcriptome GTF
## cut the target BAM region
## restore FASTQ data
## run trinity for local assembly
## +/- back mapping module to exclude rare transcripts
## align the transcripts to the genome and create a GTF file, then combine with the refined transcriptome
###################
## merge with public assemblies
RNAseqSupTrans=$tissue_Cuffmerge/$cuffmerge_output/filtered/refined
mergedTrans=$tissue_Cuffmerge/$cuffmerge_output/filtered/mergedTrans
mkdir -p $mergedTrans
cd $mergedTrans
## remove redundant transcripts (22296)
cat $RNAseqSupTrans/merged.gtf $ncbiGTF_file $ensGTF_file > mergeTrans_redundant.gtf
module load cufflinks/2.2.1
> final.redundant_list
cp mergeTrans_redundant.gtf mergeTrans_NonRed.gtf
redundants=1
while [ $redundants -gt 0 ];do
echo $redundants
cuffcompare -T -V -o "testRedundancy" mergeTrans_NonRed.gtf &> cuffcompare_redundant.log
grep "made redundant by" cuffcompare_redundant.log | awk '{print $2,$7}' | sed 's/)//g' > tmp.redundant_list ## First column is the descarded transcript
cat tmp.redundant_list >> final.redundant_list
awk '{print $1}' final.redundant_list | grep -v -F -w -f - mergeTrans_redundant.gtf > mergeTrans_NonRed.gtf
redundants=$(cat tmp.redundant_list | wc -l)
done
#= Summary for dataset: mergeTrans_NonRed.gtf :
# Query mRNAs : 126369 in 47760 loci (97710 multi-exon transcripts)
# (18089 multi-transcript loci, ~2.6 transcripts per locus)
#grep "^[NX]\|^rna" final.redundant_list > ncbi.redundant
#grep "^ENSECAT" final.redundant_list > ens.redundant
#grep "^TCONS" final.redundant_list > refined.redundant
#for f in *.redundant; do
# wc -l $f
# echo "made redundant by ncbi: " $(awk '{print $2}' $f | grep "^[NX]\|^rna" | wc -l)
# echo "made redundant by ensembl: " $(awk '{print $2}' $f | grep "^ENSECAT" | wc -l)
# echo "made redundant by refined Transcriptome: " $(awk '{print $2}' $f | grep "^TCONS" | wc -l)
#done > redundancy_report