forked from chiulab/surpi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
abyss_minimus.sh
executable file
·138 lines (127 loc) · 6.39 KB
/
abyss_minimus.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/bin/bash
#
# abyss_minimus.sh
#
# This script performs de novo asembly on a fasta file using AbySS and Minimo
# Chiu Laboratory
# University of California, San Francisco
# January, 2014
#
# (1) accepts fasta file with multiple barcodes
# (2) splits fasta file into individual barcodes, for each barcode: runs abyss (both on the entire file, and on separate splits of the file).
# (3) outputs contig assemblies, including all.$1.unitigs.cut$2.$3-mini.fa
# (4) user specifies length cutoff after abyss assembly, and cutoff after minimo assembly, as well as number of cores and kmer value
#
# Copyright (C) 2014 Samia N Naccache - All Rights Reserved
# SURPI has been released under a modified BSD license.
# Please see license file for details.
# Last revised 5/21/2014
if [ $# -lt 5 ]
then
echo "usage <1.fasta> <2.cutoff_post Abyss> <3.cutoff_post Minimo> <4.#cores> <5.kmer>"
exit 65
fi
k="$5"
scriptname=${0##*/}
#### create list of barcodes in the fasta file in the following format: #barcode
echo -e "$(date)\t$scriptname\tStarting split barcode"
START0=$(date +%s)
grep ">" $1 | sed 's/#/ /g' | awk '{print$2}' | sort | uniq | sed 's/\// /g' | awk '{print$1}' | sort | uniq | sed 's/^/#/g' > $1.barcodes
echo -e "$(date)\t$scriptname\tDone split barcode"
END0=$(date +%s)
diff=$(( $END0 - $START0 ))
echo -e "$(date)\t$scriptname\tsplit barcode Took $diff s"
###
### generate fasta file for every separate barcode (demultiplex)
for f in `cat $1.barcodes` ; do
grep -F "$f/" $1 -A 1 | sed '/--/d' > bar$f.$1
# split demultiplexed fasta file into 100,000 read sub-fastas
echo -e "$(date)\t$scriptname\tStarting split fasta"
START1=$(date +%s)
cp bar$f.$1 bar$f.$1_n # So that the unsplit demultiplexed file is also denovo assembled #
split_fasta.pl -i bar$f.$1 -o bar$f.$1 -n 100000
echo -e "$(date)\t$scriptname\tDone split fasta"
END1=$(date +%s)
diff=$(( $END1 - $START1 ))
echo -e "$(date)\t$scriptname\tsplit fasta Took $diff s"
###
### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file
echo -e "$(date)\t$scriptname\tStarting abyss on each 100k"
START2=$(date +%s)
for d in bar$f.$1_* ; do
abyss-pe k=$k name=$d.f se=$d np=$4 >& $d.abyss.log
done
echo -e "$(date)\t$scriptname\tDone abyss on each 100k"
END2=$(date +%s)
diff=$(( $END2 - $START2 ))
echo -e "$(date)\t$scriptname\tabyss Took $diff s"
###
### contigs from split files concatenated, after which reads smaller than the cutoff value are eliminated
echo -e "$(date)\t$scriptname\tStarting concatenating and cutoff of abyss output"
START3=$(date +%s)
#concatenating contig files from different fasta splits, and adding kmer infromation and barcode information to contig headers
cat bar$f.$1_*.f-unitigs.fa | sed 's/ /_/g' | sed "s/$/"_kmer"$k""/g;n" | sed "s/$/"$f"/g;n" > all.bar$f.$1.unitigs.fa
# only contigs larger than $2 are retained
cat all.bar$f.$1.unitigs.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$2') printf("%s\n%s\n",x,$0)}' > all.bar$f.$1.unitigs.cut$2.fa
echo -e "$(date)\t$scriptname\tDone concatenating and cutoff of abyss output"
END3=$(date +%s)
diff=$(( $END3 - $START3 ))
echo -e "$(date)\t$scriptname\tconcatenating and cutoff of abyss output Took $diff s"
###
### run Minimo (OLC assembler)
echo -e "$(date)\t$scriptname\tStarting Minimo"
START4=$(date +%s)
Minimo all.bar$f.$1.unitigs.cut$2.fa -D FASTA_EXP=1
echo -e "$(date)\t$scriptname\tDone Minimo"
END4=$(date +%s)
diff=$(( $END4 - $START4 ))
echo -e "$(date)\t$scriptname\tMinimo Took $diff s"
###########
echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output"
START5=$(date +%s)
# Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 )
# then we add the relevant barcode to the end of the header contig. Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away
cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$1.unitigs.cut$2-minim.fa
# change generic name all-contigs.fa
mv all-contigs.fa all-contigs.fa.$f
# only contigs larger than $3 are retained
cat all.bar$f.$1.unitigs.cut$2-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$1.unitigs.cut$2.$3-mini.fa
echo -e "$(date)\t$scriptname\tDone cat barcode addition and cutoff of minimo output"
END5=$(date +%s)
diff=$(( $END5 - $START5 ))
echo -e "$(date)\t$scriptname\tcat barcode addition and cutoff of minimo outputTook $diff s"
done
###
### concatenate deBruijn -> OLC contigs from all barcodes together
echo -e "$(date)\t$scriptname\tStarting concatenate all barcodes"
START9=$(date +%s)
echo -e "$(date)\t$scriptname\tDone concatenating all barcodes"
cat all.bar*.$1.unitigs.cut$2.$3-mini.fa > all.$1.unitigs.cut$2.$3-mini.fa
END9=$(date +%s)
diff=$(( $END9 - $START9 ))
echo -e "$(date)\t$scriptname\tbarcode concatenation Took $diff s"
# cleaning up files by organizing directories, moving files into directories, and removing temporary files
mkdir $1.dir
mv bar*$1*.fa $1.dir
if [ -e all.$1.contigs.abyssmini.cut$2.$3.e1.NR.RAPSearch.m8 ]
then
mv all.$1.contigs.abyssmini.cut$2.$3.e1.NR.RAPSearch.m8 $1.dir
fi
rm -f all.$1.contigs.abyssmini.cut$2.$3.e1.NR.RAPSearch.aln
rm -f all.$1.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader
rm -f all.$1.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.seq
rm -f all.$1.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader
rm -f all.$1.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.ex.fa
rm -f all.$1.unitigs.cut$2-contigs.sortlen.seq
rm -f all-contigs*
rm -f all.bar*.$1.unitigs.cut$2-minim.fa
rm -f $1.barcodes
rm -f all.$1.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.fasta
rm -f all.$1.contigs.abyssmini.cut$2.$3.e1.NR.RAPSearch.addseq.gi
rm -f all.$1.contigs.abyssmini.cut$2.$3.e1.NR.RAPSearch.addseq.gi.uniq
rm -f all.$1.contigs.abyssmini.cut$2.$3.e1.NR.RAPSearch.addseq.gi.taxonomy
rm -f bar#*.$1_*
rm -f bar#*$1*fasta
rm -f *$1.unitigs.cut$2.fa.runAmos.log
rm -f all.bar*.$1.unitigs.cut$2.$3-mini.fa
rm -f bar*$1