-
Notifications
You must be signed in to change notification settings - Fork 1
/
concatenate_fastq.pl
executable file
·615 lines (512 loc) · 29.3 KB
/
concatenate_fastq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
#!/usr/bin/env perl
use warnings;
# select STDOUT; # When printing to STDOUT from multiple threads/forks/processes at a time, you need to make STDOUT "hot" instead of the default buffered. When buffered, the output can be garbled and intermingled, especially when redirecting STDOUT to a file. See http://www.perlmonks.org/?node_id=619092 and http://perl.plover.com/FAQs/Buffering.html . # This by itself slows things down by about 2-fold
$| = 1;
#Add use lib statement to assume there is a directory at the same level as bin in which the script is run, called 'lib'
use FindBin;
use lib "$FindBin::Bin/../lib";
use lib "$FindBin::Bin";
use strict;
use FileHandle;
use aomisc;
#use fastq_clustal;
use Cwd;
use diagnostics;
use Getopt::Long;
use Data::Dumper;
use File::Basename;
use Bio::Tools::Run::Alignment::Clustalw;
use Bio::AlignIO;
use Bio::SimpleAlign;
use Bio::SeqIO;
use File::Temp;
use Parallel::Loops;
# Andrew J. Oler, PhD
# Computational Biology Section
# Bioinformatics and Computational Biosciences Branch (BCBB)
# OCICB/OSMO/OD/NIAID/NIH
# Bethesda, MD 20892
#
#
#This package is free software; you can redistribute it and/or modify
#it under the terms of the GPL (either version 1, or at your option,
#any later version) or the Artistic License 2.0.
# Some code and ideas for reading two fastq files at the same time borrowed from http://www.dylanstorey.com/node/50
#Print out the options
if (@ARGV){ print STDERR "Arguments: ", join " ", @ARGV, "\n"; }
my $save = Cwd::cwd();
my $files;
my $verbose;
my $output;
my $gzip;
my $number = 0;
my $ref;
my $buffer = 20;
my $cpu = 1;
my $char = "N";
my $clustalw;
GetOptions('save=s' => \$save, 'output=s' => \$output, 'o=s' => \$output, 'verbose' => \$verbose, 'files=s' => \$files, 'gzip' => \$gzip, 'number=s' => \$number, 'n=s' => \$number, 'ref=s' => \$ref, 'r=s' => \$ref, 'buffer=s' => \$buffer, 'b=s' => \$buffer, 'cpu=s' => \$cpu, 'p=s' => \$cpu, 'char=s' => \$char, 'c=s' => \$char, 'clustalw' => \$clustalw, 'w' => \$clustalw);
#-----------------------------------------------------------------------------
#----------------------------------- MAIN ------------------------------------
#-----------------------------------------------------------------------------
my $usage = basename($0)." [options] <fastq1> <fastq2>
Takes paired-end fastq read files and concatenates the R2 read onto the R1 read and the R2
quality scores onto the R1 quality scores. R2 sequence is reverse complemented before
joining to R1; R2 quality scores are reversed before joining to R1. Assumes there is no
overlap between the R1 and R2 reads. (If you expect there could be overlap, even a few
bp, use something like pandaseq or pear to get concatenated/joined reads.)
fastq1 is the first read in a pair.
fastq2 is the second read in a pair.
Processes about 1000 pairs/second with 12 processors (if no other instances running) using
a reference. 5x faster without using a reference.
OPTIONS:
-n/--number Number of Ns to use in concatenation. Default = 0.
-r/--ref Single fasta file containing reference sequence to use to decide a custom
number of Ns to insert in concatenation. R1 and R2 reads will be aligned to the
reference with clustalw or bwa mem and the internal gap size will be determined.
This is recommended for long read lengths, e.g., MiSeq with 250bp reads. If bwa mem is
used for alignment (default), this file should first be indexed using command
'bwa index <file>'.
-b/--buffer Number of bases up and down of the end of the first read to look for the
gap between reads with respect to reference. Only required if --ref specified.
Default = 20. This should be at least twice the expected gap size.
-w/--clustalw Use ClustalW alignment with reference sequence to determine gap size.
Default is to use bwa mem. Using ClustalW is about 20-25x slower. Requires --ref.
-p/--cpu Number of processors to use. Default = 1. Determines the number of threads
to use for bwa if --ref is supplied.
-c/--char Character to use for concatenation. Default = N.
-o/--output Output file name. Default is [file1 basename, minus \"_1\"].concat.fastq .
Example:
concatenate_fastq.pl -o Sample_5.concat.fastq -r Seq12_093009_HA_cds.fa --cpu 14 Sample_5_1.fastq Sample_5_2.trim.fastq
";
# Change log
# 2013-09-23
# Created script.
# 2013-11-14
# Added option --number to put Ns between R1 and R2.
# 2013-11-15
# Added option --ref to align to reference and add a custom number of Ns between the reads during concatenation.
# 2013-11-18
# Parallelized concatentation. Optimal cpu number is about 12-14. ~17x speed-up with 14 cpus. Processes ~800,000 pairs of reads in 3 hours about 4400 pairs/minute.
# 2013-11-27
# Added option --char/-c. Also changed short option for --cpu to -p instead of -c.
# Removed some alternate code in sub get_custom_number_Ns that was commented out (running clustalw manually instead of using Bioperl Run module)
# 2013-12-03
# Added select STDOUT; line before $| = 1; to try to fix the problem of lines in the output being intermingled. New problem is that there are many blank lines in the output now.
# 2013-12-04
# Removed select STDOUT as it was printing blank lines. Instead, open the output file in append mode in the child processes, lock the file handle, write to the file, then close it.
# Added -o/--output option to give output file a name
# Added the function to align ungapped concatenated reads to the reference sequence with bwa mem to look for gap size to increase speed. Made this the default mode. Clustalw is optional, via -w/--clustalw. Renamed sub get_custom_number_Ns to get_custom_number_Ns_clustalw
# Commented out the Parallel Loop so the BWA method could be fast (3850 pairs/second). Copying a large hash to all child processes was slowing it down to 10 minutes per 1000 pairs.
# 2013-12-17
# Changed the base quality to 3 '$' instead of 2 '#' for the inserted Ns. This way, I can use fastq_quality_filter -q 3 -p 100 to filter out any concatenated reads where either R1 or R2 had an N in the original read if I want to.
# 2014-01-09
# Removed the Parallel Loop lines that were commented out.
# 2015-01-16
# Changed Processed read counter to every 100K instead of every 10K.
# 2015-03-16
# Added table of gap lengths
# 2015-03-18
# Added basename to temporary output files to avoid collisions in case no output directory is specified
# 2015-05-14
# Set default for primerID length in get_custom_number_Ns_clustalw to 12 instead of 10
# Put the Parallel Loop for clustalw in a separate subroutine and made it run in parallel. It will now be done and saved as a hash of id-> gap length just like the bwa subroutine.
# Added a step to check the reference sequence for a gap (i.e., insertion in the read), and save this as a negative gap.
# Also counting insertion values from cigar string when running bwa as negative gap values in the table output to screen. Treating it as zero-gap in the output fastq file for now though.
# Added $count to file name because tempdir is reusing the same random string.
# Check first whether bwa index exists before running bwa index. -- Actually, reference should be indexed before running. Exits with a warning if sam file is empty.
# To do
# Make a distribution graph of insert sizes and/or gap sizes when using --ref. Gap sizes are already in a table so maybe not gap sizes.
# Test MAFFT to see if we can speed it up even further compared to Clustalw
# Make clustalw-specific modules (e.g., Bioperl Run) optional using eval.
# # Maybe make an option to remove any sequences that don't align to the reference...
# Add a script before this one the check whether reads of a pair overlap or not; if so, send to pandaseq, if not, run in concatenate_fastq.pl
# Maybe change default output directory to something other than Cwd?
# Parameterize primerID length in get_custom_number_Ns_clustalw, instead of using 10 (or 12) as default. Maybe also parameterize the expected gap length. Or make the default buffer size large enough...
# Maybe change iterator() sub so that it saves them as an array in the first place, so no need to split later.
# Find a way to deal with *insertions* (when no gap, but seems to be an insertion. Maybe trim off some of the read before concatenating...?
unless ($ARGV[1]){ print STDERR "$usage\n"; exit; }
my $start_time = time;
unless (-d $save){ mkdir($save) or warn "$!\n" }
if ($clustalw){
unless ($ref){
die "Please enter reference sequence for alignment\n";
}
}
my $refseq = ""; # reference sequence itself
if ($ref && $clustalw){
my $in = Bio::SeqIO->new(-file => $ref , -format => 'Fasta' ); # There should only be one sequence in this file, so it should be the first. # This conflicts with the Bioperl module
my $seq = $in->next_seq();
$refseq = $seq->seq();
if ($verbose){ print "refseq: $refseq\n"; }
}
# Run BWA or clustalw to get gap sizes if reference sequence file is supplied.
my $id_to_gap_hash; # hashref with keys as fastq id and value is the gap size to insert between R1 and R2. Determined from CIGAR string in alignment with bwa mem.
if ($ref){
if($clustalw){
# Use clustalw to align each of the reads to reference and save gap size in hash
$id_to_gap_hash = pass_seqs_to_clustalw($ARGV[0],$ARGV[1], $refseq, $buffer);
}
else {
# Use BWA to align all of the reads and save gap sizes as determined from CIGAR string.
if (check_for_bwa() ){ # check_for_bwa() sub will return 1 if bwa mem is not found; will also print an appropriate error message.
exit 1;
}
else {
print STDERR "Found bwa mem. Aligning reads to $ref\n";
$id_to_gap_hash = get_gaps_with_bwa($ref, $ARGV[0], $ARGV[1], $buffer);
}
}
# Now print out counts for gap sizes.
my $gap_sizes; # hashref with keys as gap sizes
foreach my $gap (values %$id_to_gap_hash){
$gap_sizes->{$gap}++;
}
print STDERR "Gap_size\tCount\n";
for my $gap (sort {$gap_sizes->{$b} <=> $gap_sizes->{$a}} keys %$gap_sizes){
print STDERR "$gap\t$gap_sizes->{$gap}\n";
}
print STDERR "Note: negative gap values are treated as 0.\n";
}
# print Dumper($id_to_gap_hash); exit;
# Set up output file
unless ($output){
my ($filename,$dir,$ext) = fileparse($ARGV[0],@SUFFIXES);
$filename =~ s/_\d$/\.concat/; # e.g., Sample_5_1.fastq changed to Sample_5.concat (.fastq added next line)
$output = $save . "/" . $filename . $ext; # Don't open here, open in each child fork
}
if (-e $output){
print STDERR "File already exists: $output -- will overwrite\n";
# sleep 5;
unlink($output);
}
open (my $FH1 , '<' , $ARGV[0]) || die $!;
open (my $FH2 , '<' , $ARGV[1]) || die $!;
my $fastq_iterator = iterator($FH1);
my $fastq_iterator_2 = iterator($FH2);
my $count = 0; # Keep a count of progress. Need to increment this in the first sub.
if ($ref){
print STDERR "Walking through reads to insert gaps between reads. ";
&elapsed($start_time, 'Elapsed', $verbose);
}
my $entry = "";
my $entry2 = "";
my $writefh = open_to_write($output);
# The BWA method is very fast if you don't use a Parallel Loop.
while( $entry = $fastq_iterator->()){
$entry2 = $fastq_iterator_2->();
$count++;
if ($count % 10000 == 0){
print STDERR "Processed $count reads. ";
&elapsed($start_time, 'Elapsed', $verbose);
}
# print "record $count\nread1:\n$entry\nread2:\n$entry2\n";
my @fastq = split(/\n/, $entry);
my @fastq2 = split(/\n/, $entry2); # About 225,000 records per second before doing this step. # Adding this conversion to an array brings speed to about 130,000 records per second
# Check that the ids match.
my @ids = split(/\s+/, $fastq[0]); # e.g., split this: @MISEQ:50:000000000-A4142:1:1101:17205:1539 1:N:0:ACCACCTT
my @ids2 = split(/\s+/, $fastq2[0]);
# die "Read ids do not match! Exiting...\n" unless ($ids[0] eq $ids2[0]); # Adding this check to all reads brings speed to about 100,000 records per second. Just checking every 1000th read to save time (to see if the files became out of sync at a certain point), it runs at 110,000 records per second. Just check all reads in that case -- it doesn't take much more time.
my $first = Bio::Seq->new( -seq => $fastq[1], -id => $fastq[0], );
my $second = Bio::Seq->new( -seq => $fastq2[1], -id => $fastq[0], );
$second = $second->revcom(); # Seq object, Reverse complement
my $first_quals = $fastq[3];
my $second_quals = reverse($fastq2[3]); # Reverse quality scores.
if ($ref){
my $id = $ids[0]; # Need just the first part of the id, because that is how the hash keys were saved, e.g., @MISEQ:50:000000000-A4142:1:1101:17205:1539 . Also remove the @
$id =~ s/^@//;
$number = $id_to_gap_hash->{$id};
$number = 0 if ($number < 0); # insertions are counted as negative values. Maybe we should use them to trim some of the read off...? For now just treat it as if there were no gap.
# print "id: $id number: $number\n\n-->$id_to_gap_hash->{$id}\n"; exit;
}
my $new_fastq = $first->id() . "\n" . $first->seq() . "$char"x$number . $second->seq() . "\n" . '+' . "\n" . $first_quals . '$'x$number . $second_quals . "\n"; # Use '$' as low quality for gap between R1 and R2, (quality 3. Was '#', quality 2.)
print $writefh "$new_fastq";
}
close($FH1);
close($FH2);
close($writefh);
&elapsed($start_time, 'Total', $verbose);
#-----------------------------------------------------------------------------
#---------------------------------- SUBS -------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
sub pass_seqs_to_clustalw {
# Takes the two input fastq files and reads them simultaneously and passes two reads at a time to clustalw, then save the gap sizes to a hash.
my ($fastq1, $fastq2, $refseq, $buffer) = @_;
open (my $FH1 , '<' , $ARGV[0]) || die $!;
open (my $FH2 , '<' , $ARGV[1]) || die $!;
my $fastq_iterator = iterator($FH1);
my $fastq_iterator_2 = iterator($FH2);
print STDERR "Processing aligning reads to reference with clustalw to determine gap size for each pair of reads.\n";
my $count = 0; # Keep a count of progress. Need to increment this in the first sub.
my $entry = "";
my $entry2 = "";
# Start Parallel loop here...
my %id_to_gap_hash; # hashref to store id -> gap hash. Make this a shared hash.
my $pl = Parallel::Loops->new($cpu);
$pl->share(\%id_to_gap_hash);
$pl->while( sub { $entry = $fastq_iterator->(); $entry2 = $fastq_iterator_2->(); $count++; defined($entry); }, sub {
if ($count % 1000 == 0){
print STDERR "Processed $count reads. ";
&elapsed($start_time, 'Elapsed', $verbose);
}
# print "record $count\nread1:\n$entry\nread2:\n$entry2\n";
my @fastq = split(/\n/, $entry);
my @fastq2 = split(/\n/, $entry2); # About 225,000 records per second before doing this step. # Adding this conversion to an array brings speed to about 130,000 records per second
# Check that the ids match.
my @ids = split(/\s+/, $fastq[0]); # e.g., split this: @MISEQ:50:000000000-A4142:1:1101:17205:1539 1:N:0:ACCACCTT
# my @ids2 = split(/\s+/, $fastq2[0]);
# die "Read ids do not match! Exiting...\n" unless ($ids[0] eq $ids2[0]); # Adding this check to all reads brings speed to about 100,000 records per second. Just checking every 1000th read to save time (to see if the files became out of sync at a certain point), it runs at 110,000 records per second. Just check all reads in that case -- it doesn't take much more time.
my $first = Bio::Seq->new( -seq => $fastq[1], -id => $fastq[0], );
my $second = Bio::Seq->new( -seq => $fastq2[1], -id => $fastq[0], );
$second = $second->revcom(); # Seq object, Reverse complement
my $first_quals = $fastq[3];
my $second_quals = reverse($fastq2[3]); # Reverse quality scores.
$number = get_custom_number_Ns_clustalw($refseq, $first->seq() , $second->seq(), $buffer, $count);
# my $new_fastq = $first->id() . "\n" . $first->seq() . "$char"x$number . $second->seq() . "\n" . '+' . "\n" . $first_quals . '$'x$number . $second_quals . "\n"; # Use '$' as low quality for gap between R1 and R2, (quality 3. Was '#', quality 2.)
# print $writefh "$new_fastq";
my $id = $ids[0]; # Need just the first part of the id, because that is how the hash keys were saved, e.g., @MISEQ:50:000000000-A4142:1:1101:17205:1539 . Also remove the @
$id =~ s/^@//;
$id_to_gap_hash{$id} = $number;
}
); ## End of Parallel Loop;
close($FH1);
close($FH2);
return (\%id_to_gap_hash);
}
#-----------------------------------------------------------------------------
sub iterator{
my $file_handle = shift // die $!;
return sub{
my $record .= readline($file_handle) // return;
$record .= readline($file_handle);
$record .= readline($file_handle);
$record .= readline($file_handle);
return $record;
};
}
#-----------------------------------------------------------------------------
sub get_custom_number_Ns_clustalw {
# Arguments
# Reference sequence, First sequence, second sequence, buffer
# Concatenates reads with no Ns, makes a temporary fasta file, aligns concatenated read to reference with clustalw, then gets the region that should correspond to the gap between the reads and looks for the largest gap.
my ($refseq,$first,$second,$buffer,$count) = @_;
my $read_length = length($first);
my $concat = $first . $second;
if ($verbose){ print STDERR "second in sub: $second\n"; }
if ($verbose){ print STDERR "concat: $concat\n"; }
# Create temporary fasta file
my $cwd = Cwd::cwd();
my $tempdir = File::Temp->newdir( "$cwd/fasta_".$count."_tempXXXXXX" ); # was "$cwd/fasta_file_tempXXXXX"
my $temp_fasta = $tempdir."/temp.fa";
my $fh = open_to_write($temp_fasta, 0, 0, 1);
print $fh ">ref\n$refseq\n>read\n$concat\n";
close($fh);
# Align with clustalw
my $factory = Bio::Tools::Run::Alignment::Clustalw->new(-OUTORDER => "INPUT", -QUIET => 1, -QUICKTREE => 1, ); # -OUTORDER => "INPUT", -QUIET => 1,
my $aln = $factory->align("$temp_fasta"); # $aln is a SimpleAlign object. http://search.cpan.org/~cjfields/BioPerl-1.6.901/Bio/SimpleAlign.pm.
# Get a slice of the alignment near the gap
my $primerID_length = 12;
my $col = $aln->column_from_residue_number("read", $primerID_length) - $primerID_length; # Effective postion of beginning of read in the alignment. Look for the position of the 12th read residue within the alignment and then subtract 12. Just in case some of the first bases align to an early region of the reference, especially if the first 12 bases are a random primerID, so they could align to anything.
if ($verbose){ print STDERR "col: $col\n"; }
my $slice = $aln->slice($col + $read_length - $buffer, $col + $read_length + $buffer + 10); # If buffer is 20, then go 20 bp upstream and 30 bp downstream of the end of the R1 read. The extra 10 is based on the expected gap size. Maybe it should be command-line option....
if ($verbose){
my $seqIO = Bio::AlignIO->new(-fh => \*STDERR, -format => 'clustalw'); # I had this outside of the "if ($verbose)" block and it slowed things down a LOT.
$seqIO->write_aln($slice);
}
# Get the aligned read from the alignment
my @seqs = $slice->each_seq();
my $aln_read_seq = $seqs[-1]->seq(); # Works. Needs to have reference first and read second.
if ($verbose){ print STDERR "read: $aln_read_seq\n"; }
# Look for largest gap. Gap could be '-' or '.' character.
my $gap_length = get_gap_size($aln_read_seq);
# If gap_length is 0, get the reference sequence and look for a gap there -- maybe there is an insertion in the read
if ($gap_length == 0){
# Get ref sequence
my $aln_ref_seq = $seqs[0]->seq(); # Works. Needs to have reference first and read second.
if ($verbose){ print STDERR "ref: $aln_ref_seq\n"; }
# Look for largest gap. Gap could be '-' or '.' character.
$gap_length = -1 * get_gap_size($aln_ref_seq); # If gap found in reference, that would be an insertion in the read, so count this as a "negative" gap.
}
if ($verbose){ print STDERR "gap length: $gap_length\n"; }
return $gap_length;
}
#-----------------------------------------------------------------------------
sub get_gap_size {
# Takes a sequence and looks for the largest sequence of '-' or '.' characters
my $seq = shift;
my $gap_length = 0; # Default gap size is zero
while($seq =~ m/([\-\.]+)/g){ # Looking for consecutive '-' or '.' characters.
if (length($1)>$gap_length){
$gap_length = length($1);
}
}
return $gap_length;
}
#-----------------------------------------------------------------------------
sub check_for_bwa {
# Check for bwa mem on the PATH.
# Returns 0 if successful, 1 if an error occurs.
my $pwd = pwd_for_hpc();
my $output=`$pwd/bwa mem 2>&1`; # Captures stdout and stderr.
$output = trim($output);
# Test on command-line: perl -e'use aomisc; my $output=`bwa mem 2>&1`; $output = trim($output); print "-$output-\n"'
# If bwa mem is present, value will start: "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]".
# If an older version of bwa is present, value will be: "[main] unrecognized command 'mem'"
# If no bwa is present, the value will be undef.
if ($output =~ m/Usage: bwa mem/){
return 0; #successfully found bwa mem
}
elsif($output =~ m/unrecognized command/){
print STDERR "Please update bwa to version 0.7.5a-r405 or newer.\n";
return 1;
}
else{
print STDERR "No bwa installation detected. Please install bwa (version 0.7.5a-r405 or newer) or add to PATH environment variable to run this program.\n";
return 1;
}
}
#-----------------------------------------------------------------------------
sub get_gaps_with_bwa{
my ($ref, $fastq_file1, $fastq_file2, $buffer) = @_;
# print join "\t", ($ref, $fastq_file1, $fastq_file2, $buffer),"\n"; exit;
my $id_to_gap_hash; # Hash to store
#### First, go through the input fastq to make fasta files with the sequences concatenated with no Ns. ###
# This section borrows a lot of code from above. It might be nice to put it all in a subroutine at some point to make everything more concise.
# Get file base name to add to temp files.
my ($base,$path,$ext) = fileparse($fastq_file1,@SUFFIXES);
$base =~ s/_R?1//g;
# Create temporary fastq file
print STDERR "Making temporary fastq file by concatenating with no gap."; &elapsed($start_time, ' Elapsed', $verbose);
# my $cwd = Cwd::cwd();
# my $tempdir = File::Temp->newdir( "$cwd/concat_fastq_tempXXXXXXX" );
my $tempdir = $save;
my $temp_fastq = "$tempdir/$base" . ".temp.fastq";
my $fh = open_to_write($temp_fastq, 0, 0, 1);
open (my $first , '<' , $fastq_file1) || die $!;
open (my $second, '<' , $fastq_file2) || die $!;
my $fastq_iterator = iterator($first);
my $fastq_iterator_2 = iterator($second);
my $read_length = 0;
while (my $entry = $fastq_iterator->() ){
my $entry2 = $fastq_iterator_2->(); # About 225,000 records per second
my @fastq = split(/\n/, $entry);
my @fastq2 = split(/\n/, $entry2); # Adding this conversion to an array brings speed to about 130,000 records per second
my @ids = split(/\s+/, $fastq[0]);
my @ids2 = split(/\s+/, $fastq2[0]);
die "Read ids do not match! Exiting...\n" unless ($ids[0] eq $ids2[0]); # Adding this check to all reads brings speed to about 100,000 records per second. Just checking every 1000th read to save time (to see if the files became out of sync at a certain point), it runs at 110,000 records per second. Just check all reads in that case -- it doesn't take much more time.
$read_length = length($fastq[1]) unless ($read_length);
my $first = Bio::Seq->new( -seq => $fastq[1], -id => $fastq[0], );
my $second = Bio::Seq->new( -seq => $fastq2[1], -id => $fastq[0], );
$second = $second->revcom(); # Seq object, Reverse complement
my $first_quals = $fastq[3];
my $second_quals = reverse($fastq2[3]); # Reverse quality scores.
my $number = 0;
my $new_fastq = $first->id() . "\n" . $first->seq() . "$char"x$number . $second->seq() . "\n" . '+' . "\n" . $first_quals . "#"x$number . $second_quals . "\n"; # Use '#' as low quality for gap between R1 and R2, 2
print $fh "$new_fastq";
}
close($first);
close($second);
close($fh);
# system("wc $temp_fastq");
#### Now align to the reference sequence with bwa mem ###
print STDERR "Aligning temporary fastq file to reference with bwa mem."; &elapsed($start_time, ' Elapsed', $verbose);
my $refbase = basename($ref);
my $temp_sam = "$tempdir/$base" . ".temp.sam";
# my $temp_sam = "temp.sam";
my $pwd = pwd_for_hpc();
my $bwa_index_out = "$tempdir/$base" . ".bwa_index_out.txt";
my $bwa_mem_out = "$tempdir/$base" . ".bwa_mem_out.txt";
my $samtools_view_out = "$tempdir/$base" . ".samtools_view_out";
# [Need to check first whether the bam index exists before running bwa index...]
# [Also check whether $ref exists in $tempdir before copying...]
# my $cmd = "cp $ref $tempdir/; $pwd/bwa index $tempdir/$refbase 2> $bwa_index_out && $pwd/bwa mem -t $cpu -M $tempdir/$refbase $temp_fastq 2> $bwa_mem_out | $pwd/samtools view -S -F 256 - 2> $samtools_view_out > $temp_sam";
my $cmd = "$pwd/bwa mem -t $cpu -M $tempdir/$refbase $temp_fastq 2> $bwa_mem_out | $pwd/samtools view -S -F 256 - 2> $samtools_view_out > $temp_sam";
# print $cmd; exit;
print STDERR "Executing: $cmd\n";
system($cmd); # This will copy the reference sequence to the temporary directory, index it, align the reads, filter to get only primary alignments and output a SAM file.
# system("cat $tempdir/bwa_mem_out.txt");
# system("wc $temp_sam");
# Check to see whether sam file is non-empty. If no bwa index is present, sam file will be empty.
unless (-s $temp_sam){
print STDERR "SAM file empty. Please check to see if your reference $ref has been indexed with bwa index.\n";
exit 1;
}
#### Now parse the sam file to get id and cigar string and save gaps between reads
print STDERR "Alignment done, parsing output."; &elapsed($start_time, ' Elapsed', $verbose);
my $samfh = open_to_read($temp_sam);
my $unmapped = 0;
my $mapped = 0;
while(<$samfh>){
chomp;
my @F = split(/\t/);
my ($id,$cigar_string) = ($F[0],$F[5]);
unless($id && $cigar_string =~ m/\w+/){ # Check to see that the reads were mapped. If not mapped, cigar string will be *. If not mapped, assign gap as zero.
$id_to_gap_hash->{$id} = 0;
$unmapped++;
next;
}
$mapped++;
my @numbers = split(/[A-Z]/, $cigar_string);
my @letters = split(/[0-9]+/, $cigar_string);
# print "cigar: $cigar_string\nnumbers: ", join " ", @numbers, "letters:", @letters; print "\n";
shift(@letters);
my $cigar; # AoA of Cigar elements
for (my $i=0; $i < @numbers; $i++){
$cigar->[$i] = [$letters[$i], $numbers[$i]];
}
# print Dumper($cigar);
my $sum = 0;
my $stored = 0;
CIGAR: for (my $i=0; $i < @$cigar; $i++){
if ($sum >= ($read_length - $buffer) && $sum <= ($read_length + $buffer * 2)){
# position of the deletion/insertion in the read with respect to reference is within one buffer upstream and 2 buffer distances downstream of the end of the R1 read
if($cigar->[$i]->[0] eq 'D'){
# print "id: $F[0]\tsum: $sum\tmatch: $cigar->[$i]->[1]\n";
$id_to_gap_hash->{$id} = $cigar->[$i]->[1]; # store the size of the gap. Positive value for gap.
$stored++;
last CIGAR; #? Is there ever insertion and deletion??
}
elsif($cigar->[$i]->[0] eq 'I'){
# somehow store the position to delete the overlap region.
# Negative value for insertion.
$id_to_gap_hash->{$id} = -1 * $cigar->[$i]->[1]; # store the size of the gap. Negative value for an insertion. (Does this work??? Yes, it works)
$stored++;
last CIGAR; #? Unless sometimes insertion and deletion??
}
}
$sum+= $cigar->[$i]->[1];
# unless ($cigar->[$i]->[1] =~ m/^\d+$/){
# print STDERR "id: $id\n";
# die Dumper($cigar->[$i]);
# }
}
unless ($stored){
$id_to_gap_hash->{$id} = 0; # If there was no D cigar element in the expected location, then there is no gap in the read, so assign a zero gap. (actually, there might actually be overlap between these reads... I've found that sometimes there is a gap in the reference with these reads)
}
}
close($samfh);
printf STDERR "%2d\treads mapped to the reference sequence (%.3f).\n", $mapped, $mapped/($mapped + $unmapped) if ($mapped);
printf STDERR "%2d\treads not mapped to the reference sequence (%.3f).\n", $unmapped, $unmapped/($mapped + $unmapped) if ($unmapped);
&elapsed($start_time, ' Elapsed', $verbose);
return ($id_to_gap_hash);
}
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#Useful things
#my $total_lines = 0;
# $total_lines++; #somewhere in subroutine, if I run through the file first.
# my $lines_done = 0;
# $lines_done++; #in while loop, increment as each new line is processed
# if (($lines_done % 25) == 0){ #put this in the while loop too.
# print "Done processing $lines_done of $total_lines sequences.";
# &elapsed($start_time, ' Elapsed', $verbose);
# }