forked from chiulab/surpi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
snap_nt_combine.sh
executable file
·132 lines (112 loc) · 4.54 KB
/
snap_nt_combine.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/bash
#
# snap_nt_combine.sh
#
# This script runs SNAP against the NT database
#
# Chiu Laboratory
# University of California, San Francisco
# January, 2014
#
# This script will successively run SNAP against NT partitions and then combine the results
#
# Note: for the NT database, default FASTQ headers will cause segmentation fault in SNAP
# need to change FASTQ headers to gi only
# 1/16/13
#
# Note: you need to use SNAP 0.15 or a higher version which outputs the "d" value in the SAM file
#
# Copyright (C) 2014 Charles Chiu - All Rights Reserved
# Permission to copy and modify is granted under the BSD license
# Last revised 5/22/2014
expected_args=6
if [ $# -lt $expected_args ]
then
echo "Usage: snap_nt_combine.sh <FASTQ input file> <directory containing SNAP NT indexes> <number of cores> <free cache memory cutoff in GB> <SNAP d-value cutoff> <# of simultaneous SNAP runs>"
exit 65
fi
###
inputfile=$1
SNAP_NT_index_directory=$2
cores=$3
free_cache_cutoff=$4
SNAP_d_cutoff=$5
simultaneous_SNAPs=$6
###
scriptname=${0##*/}
echo -e "$(date)\t$scriptname\tStarting SNAP to NT"
START1=$(date +%s)
echo -e "$(date)\t$scriptname\tInput file: $inputfile"
nopathf=${inputfile##*/} # remove the path to file
echo -e "$(date)\t$scriptname\tAfter removing path: $nopathf"
basef=${nopathf%.fastq} # remove FASTQextension
echo -e "$(date)\t$scriptname\tAfter removing FASTQ extension: $basef"
echo -e "$(date)\t$scriptname\tMapping $basef to NT..."
# rm -f $basef.NT.sam # this is removing the output file, if it is present? Should not be necessary, but commenting out for now.
for snap_index in $SNAP_NT_index_directory/*; do
START2=$(date +%s)
nopathsnap_index=${snap_index##*/} # remove the path to file
echo -e "$(date)\t$scriptname\tStarting SNAP on $nopathsnap_index"
freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1)
echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]"
if [ $freemem -lt $free_cache_cutoff ]
then
echo -e "$(date)\t$scriptname\tClearing cache..."
dropcache
fi
START_SNAP=$(date +%s)
/usr/bin/time -o $basef.$nopathsnap_index.snap.log snap single $snap_index $basef.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.$nopathsnap_index.time.log
SNAP_DONE=$(date +%s)
snap_time=$(( SNAP_DONE - START_SNAP ))
echo -e "$(date)\t$scriptname\tCompleted running SNAP using $snap_index in $snap_time seconds."
echo -e "$(date)\t$scriptname\tRemoving headers..."
START_HEADER_REMOVAL=$(date +%s)
sed '/^@/d' $basef.$nopathsnap_index.sam > $basef.$nopathsnap_index.noheader.sam
END_HEADER_REMOVAL=$(date +%s)
header_removal_time=$(( END_HEADER_REMOVAL - START_HEADER_REMOVAL ))
echo -e "$(date)\t$scriptname\tCompleted removing headers in $header_removal_time seconds."
END2=$(date +%s)
diff=$(( END2 - START2 ))
echo -e "$(date)\t$scriptname\tMapping to $snap_index took $diff seconds"
done
#SNAP does not sort its results, so in order to compare files linearly, we need to sort them manually.
echo -e "$(date)\t$scriptname\tSorting..."
START_SORT=$(date +%s)
# did some testing with optimizing the number of simultaneous sorts - maximizing this number was the speediest, so no need for GNU parallel - SMF 5/22/14
# parallel.moreutils -j $cores -i bash -c "sort {} > {}.sorted;" -- *.noheader.sam
for file in *.noheader.sam
do
sort $file > $file.sorted &
done
for jobs in `jobs -p`
do
wait $job
done
END_SORT=$(date +%s)
sort_time=$(( END_SORT - START_SORT ))
echo -e "$(date)\t$scriptname\tCompleted sorting in $sort_time seconds."
# find the best alignment hit for each line
FILEARRAY=()
for snap_index in $SNAP_NT_index_directory/* ; do
nopathsnap_index=${snap_index##*/} # remove the path to file
FILEARRAY=("${FILEARRAY[@]}" "$basef.$nopathsnap_index.noheader.sam.sorted")
done
echo -e "$(date)\t$scriptname\tStarting comparison of all SAM files."
START_COMPARE=$(date +%s)
FILEARRAY=("${FILEARRAY[@]}" "$basef.NT.sam")
# find the best alignment hit for each line
compare_multiple_sam.py ${FILEARRAY[@]}
END1=$(date +%s)
comparison_time=$(( END1 - START_COMPARE ))
echo -e "$(date)\t$scriptname\tComparison took $comparison_time seconds."
echo -e "$(date)\t$scriptname\tDone with SNAP_NT "
diff=$(( END1 - START1 ))
echo -e "$(date)\t$scriptname\toutput written to $basef.NT.sam"
echo -e "$(date)\t$scriptname\tSNAP_NT took $diff seconds"
#delete intermediate SAM files
rm *.noheader.sam
rm *.noheader.sam.sorted
for snap_index in $SNAP_NT_index_directory/* ; do
nopathsnap_index=${snap_index##*/} # remove the path to file
rm *.$nopathsnap_index.sam
done