#!/bin/bash
#
# abyss_minimus.sh
#
# This script performs de novo assembly on a fasta file using AbySS and Minimo
# Chiu Laboratory
# University of California, San Francisco
# January, 2014
#
# (1) accepts fasta file with multiple barcodes
# (2) splits the fasta file into individual barcodes; for each barcode, runs abyss (both on the entire file and on separate splits of the file).
# (3) outputs contig assemblies, including all.$inputfile.unitigs.cut$cutoff_post_Abyss.$3-mini.fa
# (4) user specifies length cutoff after abyss assembly, and cutoff after minimo assembly, as well as number of cores and kmer value
#
# Copyright (C) 2014 Samia N Naccache - All Rights Reserved
# SURPI has been released under a modified BSD license.
# Please see license file for details.
scriptname=${0##*/}
if [ $# -lt 5 ]
then
echo "Usage: $scriptname <1.fasta> <2.cutoff_post Abyss> <3.cutoff_post Minimo> <4.#cores> <5.kmer> <6.ignoreBarcodeInfo (Y/N)>"
exit
fi
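# Example invocation (file name and parameter values below are hypothetical, shown for illustration only):
#   abyss_minimus.sh sample.fasta 200 250 16 34 Y
# Argument 6 is optional: the check above only requires 5 arguments.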
###
inputfile=$1
cutoff_post_Abyss=$2
cutoff_post_Minimo=$3
cores=$4
kmer=$5
ignore_barcodes=$6
###
split_FASTA_size=100000
#### create list of barcodes in the fasta file in the following format: #barcode
echo -e "$(date)\t$scriptname\tDemultiplexing barcodes..."
START_DEMULTIPLEX=$(date +%s)
#The sed '/^#$/d' removes a blank #. This is needed in cases where barcodes are present along with reads that have no barcodes
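# Pipeline breakdown: take header lines, isolate the field after "#" (the barcode plus an optional
# /1 or /2 read-pair suffix), strip the read-pair suffix, deduplicate, and re-prefix each barcode with "#".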
grep ">" $inputfile | sed 's/#/ /g' | awk '{print$2}' | sort | uniq | sed 's/\// /g' | awk '{print$1}' | sort | uniq | sed 's/^/#/g' | sed '/^#$/d' > $inputfile.barcodes
if [ -s $inputfile.barcodes ] #if $inputfile.barcodes exists and has size>0, then we have multiple barcodes, but check if user wants to flatten & run together
then
if [ "$ignore_barcodes" = "Y" ]
then
FLATTEN_BARCODES="1"
echo "#" > $inputfile.barcodes
else
FLATTEN_BARCODES="0"
fi
else # $inputfile.barcodes doesn't exist or is empty, so there is only 1 barcode: flatten & assemble the entire run together
FLATTEN_BARCODES="1"
echo "#" > $inputfile.barcodes
fi
echo -e "$(date)\t$scriptname\tCompleted demultiplexing barcodes."
END_DEMULTIPLEX=$(date +%s)
diff_DEMULTIPLEX=$(( END_DEMULTIPLEX - START_DEMULTIPLEX ))
echo -e "$(date)\t$scriptname\tBarcode demultiplex took $diff_DEMULTIPLEX s."
### generate fasta file for every separate barcode (demultiplex)
for f in `cat $inputfile.barcodes` ; do
if [ $FLATTEN_BARCODES = "1" ]
then
ln -s $inputfile bar$f.$inputfile
else
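# Select only reads carrying this barcode: headers where "$f" is followed by a "/" read-pair suffix or
# the end of the line, plus the sequence line after each match (-A 1); --no-group-separator drops the "--" dividers.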
grep -E "$f(/|$)" $inputfile -A 1 --no-group-separator > bar$f.$inputfile
fi
# split demultiplexed fasta file into 100,000 read sub-fastas
echo -e "$(date)\t$scriptname\tSplitting FASTA into chunks of size: $split_FASTA_size."
START_SPLIT=$(date +%s)
cp bar$f.$inputfile bar$f.${inputfile}_n # So that the unsplit demultiplexed file is also de novo assembled #
split_fasta.pl -i bar$f.$inputfile -o bar$f.$inputfile -n $split_FASTA_size
echo -e "$(date)\t$scriptname\tCompleted splitting FASTA file."
END_SPLIT=$(date +%s)
diff_SPLIT=$(( $END_SPLIT - $START_SPLIT ))
echo -e "$(date)\t$scriptname\tSplitting FASTA took $diff_SPLIT s."
### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file
echo -e "$(date)\t$scriptname\tRunning abyss on each $split_FASTA_size chunk..."
START_ABYSS=$(date +%s)
for d in bar$f.${inputfile}_* ; do
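# abyss-pe: k = k-mer size, name = output prefix (so unitigs land in $d.f-unitigs.fa),
# se = single-end reads, np = number of processes; per-chunk output is captured in $d.abyss.log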
abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log
done
echo -e "$(date)\t$scriptname\tCompleted running abyss on each $split_FASTA_size chunk."
END_ABYSS=$(date +%s)
diff_ABYSS=$(( END_ABYSS - START_ABYSS ))
echo -e "$(date)\t$scriptname\tAbyss took $diff_ABYSS s."
###
### contigs from split files concatenated, after which reads smaller than the cutoff value are eliminated
echo -e "$(date)\t$scriptname\tStarting concatenating and cutoff of abyss output"
START_CATCONTIGS=$(date +%s)
#concatenating contig files from different fasta splits, and adding k-mer and barcode information to contig headers
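# The "s/.../;n" sed idiom applies each substitution only to every other line (the FASTA headers),
# so "_kmer$kmer" and the barcode are appended to contig headers rather than to sequences.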
cat bar$f.${inputfile}_*.f-unitigs.fa | sed 's/ /_/g' | sed "s/$/"_kmer"$kmer""/g;n" | sed "s/$/"$f"/g;n" > all.bar$f.$inputfile.unitigs.fa
# only contigs larger than $cutoff_post_Abyss are retained
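# Note: '$2' below is the shell's second positional argument ($cutoff_post_Abyss) spliced into the awk
# program by closing and reopening the single quotes; the Minimo cutoff step later uses '$3' the same way.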
cat all.bar$f.$inputfile.unitigs.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$2') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa
echo -e "$(date)\t$scriptname\tDone concatenating and cutoff of abyss output"
END_CATCONTIGS=$(date +%s)
diff_CATCONTIGS=$(( END_CATCONTIGS - START_CATCONTIGS ))
echo -e "$(date)\t$scriptname\tConcatenating and cutoff of abyss output took $diff_CATCONTIGS s."
###
### run Minimo (OLC assembler)
echo -e "$(date)\t$scriptname\tStarting Minimo..."
START_MINIMO=$(date +%s)
Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1
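# FASTA_EXP=1 asks Minimo (from the AMOS package) to export the assembled contigs as FASTA;
# here that export is written to all-contigs.fa, which the block below post-processes and renames.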
echo -e "$(date)\t$scriptname\tCompleted Minimo."
END_MINIMO=$(date +%s)
diff_MINIMO=$(( END_MINIMO - START_MINIMO ))
echo -e "$(date)\t$scriptname\tMinimo took $diff_MINIMO s."
###########
echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output"
START_MIN_PROCESS=$(date +%s)
# Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 )
# then we add the relevant barcode to the end of the header contig. Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away
cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa
# rename the generic Minimo output all-contigs.fa so it is not overwritten on the next barcode iteration
mv all-contigs.fa all-contigs.fa.$f
# only contigs larger than $cutoff_post_Minimo are retained
cat all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa
echo -e "$(date)\t$scriptname\tDone cat barcode addition and cutoff of minimo output"
END_MIN_PROCESS=$(date +%s)
diff_MIN_PROCESS=$(( END_MIN_PROCESS - START_MIN_PROCESS ))
echo -e "$(date)\t$scriptname\tcat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s"
done
###
### concatenate deBruijn -> OLC contigs from all barcodes together
echo -e "$(date)\t$scriptname\tConcatenating all barcodes together..."
START_CAT=$(date +%s)
cat all.bar*.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa > all.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa
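# This concatenated file, all.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa,
# is the final assembly output referenced in the header comment above.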
echo -e "$(date)\t$scriptname\tCompleted concatenating all barcodes."
END_CAT=$(date +%s)
diff_CAT=$(( END_CAT - START_CAT ))
echo -e "$(date)\t$scriptname\tBarcode concatenation took $diff_CAT s."
# cleaning up files by organizing directories, moving files into directories, and removing temporary files
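# Note: the RAPSearch-related intermediates removed below are not produced by this script; they are
# assumed to be left over from other SURPI pipeline steps when this script runs within the full pipeline.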
mkdir $inputfile.dir
mv bar*$inputfile*.fa $inputfile.dir
if [ -e all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.m8 ]
then
mv all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.m8 $inputfile.dir
fi
rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.aln
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.seq
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.ex.fa
rm -f all.$inputfile.unitigs.cut$cutoff_post_Abyss-contigs.sortlen.seq
rm -f all-contigs*
rm -f all.bar*.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa
rm -f $inputfile.barcodes
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.fasta
rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.addseq.gi
rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.addseq.gi.uniq
rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.addseq.gi.taxonomy
rm -f bar#*.${inputfile}_*
rm -f bar#*$inputfile*fasta
rm -f *$inputfile.unitigs.cut$cutoff_post_Abyss.fa.runAmos.log
rm -f all.bar*.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa
rm -f bar*$inputfile