-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_nhmmer.pl
executable file
·246 lines (212 loc) · 8.41 KB
/
parse_nhmmer.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#!/usr/bin/env perl
#===============================================================================
=pod
=head1
FILE: parse_nhmmer.pl
USAGE: ./parse_nhmmer.pl -i nhmmer.out -g genome.fas [-d outdir] [-p prefix] [--nostats] [-f output-fasta-header]
./parse_nhmmer.pl \
-i run/hmmer/AbucgeM_genome.nhmmer.out \
-g run/plast/AbucgeM_genome.plast200.fas
perl src/parse_nhmmer.pl \
-i run/hmmer/AbucgeM_genome.nhmmer.out \
-g run/plast/AbucgeM_genome.plast200.fas \
-d out/AbucgeM_genome_hmmer \
-p 'AbucgeM' \
-f 'AbucgeM' \
--nostats
DESCRIPTION: Parse HMMER output from command:
$ nhmmer --tblout nhmmer.out --cpu 10 markers.hmm genome.fas > log.nhmmer.log
Input:
HMMER-output file (from command above), Genome-fasta-file
IMPORTANT: Expected header format (in fasta files used to create the
markers.hmm file) for this version (will use "marker_id"):
<marker_id>.the_rest
Examples:
1000.sate.default.pep2cds.removed.shortname.filtered
14365.sate.removed.intron.noout.aligned-allgap.filtered.fas.degap
Output:
Will write separate fasta files to output directory, one for each marker.
OPTIONS: -i <HMMER-output> Output from nhmmer.
-g <genome-fasta-file> fasta file (same genomic file used in the nhmmer search).
-d <output-directory> Name of output directory. Default: `Parse-HMMer-output`.
-p <prefix> Prefix for fasta (marker) output file. Default name is
based on marker ID (<marker_id.fas>).
-f <output-header> Provide string for fasta header: `>string`.
Default is to have the marker ID.
-s Give HMMer stats (default) in the output fasta header.
Use `--nostats` to supress.
-h Show help information.
-v Be verbose (default), or not (`--noverbose`).
REQUIREMENTS: ---
BUGS: ---
NOTES: ---
AUTHOR: Johan Nylander (JN), [email protected]
COMPANY: NBIS/NRM
VERSION: 1.0
CREATED: 09/17/2015 11:06:04 PM
REVISION: Wed 11 Dec 2019 10:02:09 AM CET
=cut
#===============================================================================
use strict;
use warnings;
use Getopt::Long;
exec( 'perldoc', $0 ) unless (@ARGV);
my %HoH = (); # key:scaffold header, val:hash:key:query name, vals:
my %query_hash = (); # key: name, val: count
my $directory = 'Parse-HMMer-output'; # Output directory
my $fastaheader = q{}; # Output fasta header string
my $genome = q{}; # Genome fasta file
my $infile = q{}; # Hmmer file
my $prefix = q{}; # File prefix for output fasta files
my $stats = 1; # HMMer stats in output fasta header
my $VERBOSE = 1;
GetOptions(
"directory=s" => \$directory,
"fastaheader=s" => \$fastaheader,
"genome=s" => \$genome,
"help" => sub { exec( "perldoc", $0 ); exit(0); },
"infile=s" => \$infile,
"prefix=s" => \$prefix,
"stats!" => \$stats,
"verbose!" => \$VERBOSE,
);
## Read HMM-output file
open my $INFILE, "<", $infile or die "$!\n";
print STDERR "Parsing HMMER output file $infile\n" if ($VERBOSE);
while (<$INFILE>) {
chomp;
next if /^#/;
my (
$target_name, $target_accession, $query_name,
$query_accession, $hmmfrom, $hmm_to,
$alifrom, $ali_to, $envfrom,
$env_to, $sq_len, $strand,
$E_value, $score, $bias,
$description_of_target
) = split /\s+/, $_;
## Find first occurence of each query_name. This would be the best hit (assuming no ties!)
if ( $query_hash{$query_name}++ ) {
next;
}
else {
my $details = "/target name=$target_name /accession=$target_accession /query name=$query_name /accession=$query_accession /hmmfrom=$hmmfrom /hmm to=$hmm_to /alifrom=$alifrom /ali to=$ali_to /envfrom=$envfrom /env to=$env_to /sq len=$sq_len /strand=$strand /E-value=$E_value /score=$score /bias=$bias /description of target=$description_of_target";
my $scaffold = $target_name;
$HoH{$scaffold}{$query_name}{'alifrom'} = $alifrom;
$HoH{$scaffold}{$query_name}{'ali to'} = $ali_to;
$HoH{$scaffold}{$query_name}{'strand'} = $strand;
$HoH{$scaffold}{$query_name}{'details'} = $details;
}
}
close($INFILE);
## Create output directory for fasta files
if ( -e $directory ) {
die "Warning. Directory $directory already exists.\n";
}
else {
unless ( mkdir $directory ) {
die "Unable to create $directory\n";
}
}
print STDERR "Created output directory $directory\n" if ($VERBOSE);
## Read genome file
open my $FASTA, "<", $genome or die "Cannot open fasta file: $genome $! \n";
print STDERR "Reading fasta file $genome\n" if ($VERBOSE);
my $def = $/;
$/ = ">";
while (<$FASTA>) {
chomp;
next if ( $_ eq '' );
my ( $id, @sequencelines ) = split /\n/;
$id =~ s/(\S+).*$/$1/;
if ( exists( $HoH{$id} ) ) {
my $sequence = '';
foreach my $line (@sequencelines) {
$sequence .= $line;
}
foreach my $key ( keys %{ $HoH{$id} } ) {
my ( $geneid, @rest ) = split /\./, $key; # highly format dependent!
## Set fasta file name
my $outfastafile = q{};
if ($prefix) {
$outfastafile = $prefix . ".$geneid.fas";
}
else {
$outfastafile = "$geneid.fas";
}
open my $OUTFILE, ">", "$directory/$outfastafile"
or die "Could not open fasta file for writing: $! \n";
print STDERR " writing $outfastafile\n" if ($VERBOSE);
## Print fasta header
print $OUTFILE ">";
if ($fastaheader) {
print $OUTFILE $fastaheader;
}
else {
print $OUTFILE $geneid;
}
if ($stats) {
print $OUTFILE " ", $HoH{$id}{$key}{'details'}, "\n";
}
else {
print $OUTFILE "\n";
}
## Print sequence
my ( $from, $to ) = q{};
if ( $HoH{$id}{$key}{'strand'} eq '+' ) {
$from = $HoH{$id}{$key}{'alifrom'} - 1;
$to = $HoH{$id}{$key}{'ali to'};
}
elsif ( $HoH{$id}{$key}{'strand'} eq '-' ) {
$from = $HoH{$id}{$key}{'ali to'} - 1;
$to = $HoH{$id}{$key}{'alifrom'};
}
my $len = $to - $from;
my $seq = substr( $sequence, $from, $len );
if ( $HoH{$id}{$key}{'strand'} eq '-' ) {
$seq = revcomp_iupac($seq);
}
$seq =~ s/(.{60})/$1\n/g;
print $OUTFILE $seq, "\n";
close($OUTFILE);
}
}
}
close($FASTA);
$/ = $def;
print STDERR "\nEnd of script\n" if ($VERBOSE);
sub revcomp {
my $seq = shift;
$seq = uc($seq);
$seq = scalar reverse $seq;
$seq =~ tr/GATC/CTAG/;
return $seq;
}
sub revcomp_iupac {
## revcomp_iupac
## Version: Wed 20 Mar 2019 10:59:00 AM CET
## Input: DNA string
## Returns: string, reversed and complemented
## Tries to handle IUPAC ambiguity symbols:
## IUPAC Code Meaning Complement
## A A T
## C C G
## G G C
## T/U T A
## M A or C K
## R A or G Y
## W A or T W
## S C or G S
## Y C or T R
## K G or T M
## V A or C or G B
## H A or C or T D
## D A or G or T H
## B C or G or T V
## N G or A or T or C N
my $seq = shift;
$seq = uc($seq);
$seq = scalar reverse $seq;
$seq =~ tr/ACGTMRWSYKVHDBN/TGCAKYWSRMBDHVN/;
return $seq;
}
__END__