diff --git a/PhyloFlash.pm b/PhyloFlash.pm index 3b193bc..a5ae21a 100644 --- a/PhyloFlash.pm +++ b/PhyloFlash.pm @@ -586,6 +586,8 @@ sub fasta_copy_except { open_or_die(\$dfh, ">", $dest); my %acc_hash = map { $_ => 1 } @accs; my $skip = 0; + my $num_acc = scalar (keys %acc_hash); + msg ("Number of sequences to skip: $num_acc"); while(my $row = <$sfh>) { if (substr($row, 0, 1) eq '>') { my ($acc) = ($row =~ m/>([^ ]*) /); @@ -916,8 +918,10 @@ sub fix_rname_taxstr { foreach my $id (keys %$sam_href) { foreach my $segment (keys %{$sam_href->{$id}}) { foreach my $rname (keys %{$sam_href->{$id}{$segment}}) { - my $new_rname = join " ", ($sam_href->{$id}{$segment}{$rname}{'RNAME'}, $acc_href->{$sam_href->{$id}{$segment}{$rname}{'RNAME'}}); - $sam_href->{$id}{$segment}{$rname}{'RNAME'} = $new_rname; + foreach my $href (@{$sam_href->{$id}{$segment}{$rname}}) { + my $new_rname = join " ", ($href->{'RNAME'}, $acc_href->{$href->{'RNAME'}}); + $href->{'RNAME'} = $new_rname; + } } } } @@ -933,9 +937,17 @@ sub samaref_to_lines { $fastq_href) = @_; my @lines; foreach my $href (@$aref) { + # Split read ID + my $id_fwd = $href->{'QNAME'}; + #my $id_rev = $href->{'QNAME'}; + my $sep; + if ($id_fwd =~ m/^(.+)([:\/_])[12]$/) { + $id_fwd = $1.$2.'1'; + #$id_rev = $1.$2.'2'; + } # Splice in first segment dummy entry if this is rev with fwd read unmapped if ($href->{'FLAG'} & 0x80 && $href->{'FLAG'} & 0x8) { - my @splice = ($href->{'QNAME'}, # QNAME + my @splice = ($id_fwd, # QNAME 0x1 + 0x4 + 0x40, # Paired read, segment unmapped, first segment '*', # RNAME '0', # POS @@ -981,9 +993,11 @@ sub fix_pairing_flags { # Flag 0x2 - both segments have alignments foreach my $segment (keys %{$sam_href->{$id}}) { foreach my $ref (keys %{$sam_href->{$id}{$segment}}) { - $sam_href->{$id}{$segment}{$ref}{'FLAG'} += 0x2 unless $sam_href->{$id}{$segment}{$ref}{'FLAG'} & 0x2; - # Record if flag 0x10 set for this segment - $segment_revcomp{$segment} ++ if $sam_href->{$id}{$segment}{$ref}{'FLAG'} & 0x10; + foreach my $href (@{$sam_href->{$id}{$segment}{$ref}}) { + $href->{'FLAG'} += 0x2 unless $href->{'FLAG'} & 0x2; + # Record if flag 0x10 set for this segment + $segment_revcomp{$segment} ++ if $href->{$ref}{'FLAG'} & 0x10; + } } } # Sweep through once more @@ -991,7 +1005,9 @@ sub fix_pairing_flags { foreach my $segment (keys %{$sam_href->{$id}}) { foreach my $ref (keys %{$sam_href->{$id}{$segment}}) { if (defined $segment_revcomp{$other_segment{$segment}}) { - $sam_href->{$id}{$segment}{$ref}{'FLAG'} += 0x20 unless $sam_href->{$id}{$segment}{$ref}{'FLAG'} & 0x20; + foreach my $href (@{$sam_href->{$id}{$segment}{$ref}}) { + $href->{'FLAG'} += 0x20 unless $href->{'FLAG'} & 0x20; + } } } } @@ -1000,13 +1016,14 @@ sub fix_pairing_flags { # Flag 0x8 - next segment unmapped foreach my $segment (keys %{$sam_href->{$id}}) { foreach my $ref (keys %{$sam_href->{$id}{$segment}}) { - $sam_href->{$id}{$segment}{$ref}{'FLAG'} -= 0x2 if $sam_href->{$id}{$segment}{$ref}{'FLAG'} & 0x2; - $sam_href->{$id}{$segment}{$ref}{'FLAG'} += 0x8 unless $sam_href->{$id}{$segment}{$ref}{'FLAG'} & 0x8; + foreach my $href (@{$sam_href->{$id}{$segment}{$ref}}) { + $href->{'FLAG'} -= 0x2 if $href->{'FLAG'} & 0x2; + $href->{'FLAG'} += 0x8 unless $href->{'FLAG'} & 0x8; + } } } } } - # No return value because it modifies hash in place } @@ -1038,11 +1055,17 @@ sub read_sortmerna_sam { # Hash SAM record by line number push @samhref_arr, $href; - #$samhash_by_line {$counter} = $href; - # Split read ID on first whitespace [not necessary as sortmerna already does this] - #my ($id, @discard) = split / /, $href->{'QNAME'}; + # Sortmerna splits readname on first whitespace my $id = $href->{'QNAME'}; + # If readname has a hard suffix indicated fwd or reverse segment, rename + # to match forward read and hash by this key. + # This is because BBmap will later rename QNAME to match forward read + # and this is the key that is needed to search for reads that map to + # assembly + if ($id =~ m/^(.+)([:\/_])[12]$/) { + $id = $1.$2.'1'; + } # Get sequence revcomp if bitflag 0x10 is set my $sequence_original; @@ -1078,11 +1101,11 @@ sub read_sortmerna_sam { # Check whether it is primary or secondary alignment for this read # Assume that supplementary alignments not reported - if (defined $samhash_by_qname_segment_ref{$href->{'QNAME'}}{$segment}) { + if (defined $samhash_by_qname_segment_ref{$id}{$segment}) { # Flag 0x100 as secondary alignment if entry for this read already encountered $href->{'FLAG'} += 0x100 unless $href->{'FLAG'} & 0x100; } - $samhash_by_qname_segment_ref{$href->{'QNAME'}}{$segment}{$href->{'RNAME'}} = $href; + push @{$samhash_by_qname_segment_ref{$id}{$segment}{$href->{'RNAME'}}}, $href; $counter ++; } close($fh); @@ -1129,38 +1152,44 @@ sub read_interleaved_fastq { chomp $line; my $modulo = $counter % 8; last if defined $limit && ( $counter / 8 ) > $limit; - if ($counter % 8 == 0 && $line =~ m/^@(.+)/) { + if ($modulo == 0 && $line =~ m/^@(.+)/) { # Header line of fwd read my $fullheader = $1; my ($id, @discard) = split / /, $fullheader; # Split on whitespace + # Strip any suffix indicating fwd or rev segment + if ($id =~ m/^(.+)([:\/_])[12]/) { + $id = $1.$2.'1'; + } $currid = $id; if (defined $hash{$currid}) { # If another sequence of this name already defined, warn print STDERR "WARNING: Splitting Fastq header on whitespace yields non-unique seq ID: $currid\n"; } $hash{$currid}{'fwd'}{'fullheader'} = $fullheader; - } elsif ($counter % 8 == 1) { + } elsif ($modulo == 1) { # Seq line of fwd read $hash{$currid}{'fwd'}{'seq'} = $line; $hash{$currid}{'byseq'}{$line} = 'fwd'; - } elsif ($counter % 8 == 3) { + } elsif ($modulo == 3) { # Quality line of fwd read $hash{$currid}{'fwd'}{'qual'} = $line; - } elsif ($counter % 8 == 4 && $line =~ m/^@(.+)/) { + } elsif ($modulo == 4 && $line =~ m/^@(.+)/) { # Header line of rev read my $fullheader = $1; my ($id, @discard) = split / /, $fullheader; # Split on whitespace + # Strip any suffix indicating fwd or rev segment + if ($id =~ m/^(.+)([:\/_])[12]/) { + $id = $1.$2.'1'; + } if ($id ne $currid) { - print STDERR "WARNING: Fastq rev read header $id does not match fwd read $currid at line $counter\n"; - print STDERR "Check if interleaved file correctly formatted\n"; + msg ("WARNING: Fastq rev read header $id does not match fwd read $currid at line $counter\nCould not recognize standard read segment suffix\nCheck if interleaved file correctly formatted"); } - $hash{$currid}{'rev'}{'fullheader'} = $fullheader; - } elsif ($counter % 8 == 5) { + } elsif ($modulo == 5) { # Seq line of rev read $hash{$currid}{'rev'}{'seq'} = $line; $hash{$currid}{'byseq'}{$line} = 'rev'; - } elsif ($counter % 8 == 7) { + } elsif ($modulo == 7) { # Qual line of rev read $hash{$currid}{'rev'}{'qual'} = $line; } @@ -1262,9 +1291,9 @@ sub initialize_outfiles_hash { "spades_log", { description => "Log file from SPAdes assembler", - discard => 1, + discard => 0, filename => "$libraryNAME.spades.out", - intable => 0, + intable => 1, }, "full_len_class", { @@ -1332,9 +1361,9 @@ sub initialize_outfiles_hash { "emirge_log", { description => "Log file from EMIRGE sequence reconstruction", - discard => 1, + discard => 0, filename => "$libraryNAME.emirge.out", - intable => 0, + intable => 1, }, "assemratio_csv", { @@ -1474,7 +1503,7 @@ sub initialize_outfiles_hash { description => "phyloFlash log file", discard => 0, filename => "$libraryNAME.phyloFlash.log", - intable => 0, + intable => 1, }, "phyloFlash_archive", { @@ -1682,9 +1711,9 @@ sub initialize_outfiles_hash { "sam_remap_trusted", { description => "SAM file of read mapping vs trusted contigs", - discard => 1, + discard => 0, filename => "$libraryNAME.trusted.bbmap.sam", - intable => 0, + intable => 1, }, "bbmap_remap_log_trusted", { diff --git a/phyloFlash.pl b/phyloFlash.pl index 793cc0b..c2282b6 100755 --- a/phyloFlash.pl +++ b/phyloFlash.pl @@ -148,11 +148,18 @@ =head2 INPUT AND ANALYSIS OPTIONS =item -id I -Minimum allowed identity for read mapping process in %. Must be within +Minimum allowed identity for BBmap read mapping process in %. Must be within 50..98. Set this to a lower value for very divergent taxa Default: 70 +=item -evalue_sortmerna I + +E-value cutoff to use for SortMeRNA (only if I<-sortmerna> option chosen). In +scientific exponent notation (see below) + +Default: 1e-09 + =item -clusterid I Identity threshold for clustering with vsearch in %. @@ -304,7 +311,8 @@ =head1 COPYRIGHT AND LICENSE my $SEmode = 0; # single ended mode my $libraryNAME = undef; # output basename my $interleaved = 0; # Flag - interleaved read data in read1 input (default = 0, no) -my $id = 70; # minimum %id for mapping +my $id = 70; # minimum %id for mapping with BBmap +my $evalue_sortmerna = 1e-09; # E-value cutoff for sortmerna, ignored if -sortmerna not chosen my $readlength = 100; # length of input reads my $readlimit = -1; # max # of reads to use my $amplimit = 500000; # number of SSU pairs at which to switch to emirge_amplicon @@ -338,9 +346,31 @@ =head1 COPYRIGHT AND LICENSE my %outfiles; # Hash to keep track of output files # variables for report generation -my $sam_fixed_href; # Ref to hash storing initial mapping data (stored as refs to hashes keyed by SAM column name), keyed by RNAME, read orientation, QNAME +my $sam_fixed_href; # Ref to hash storing initial mapping data (stored as array of refs to hashes keyed by SAM column name), keyed by RNAME, read orientation, QNAME my $sam_fixed_aref; # Ref to array storing initial mapping data (pointing to same refs as above) in original order of SAM input file my %ssu_sam_mapstats; # Statistics of mapping vs SSU database, parsed from from SAM file +# Fields for %ssu_sam_mapstats +# total_reads Total read (pairs) input +# total_read_segments Total read segments input +# ssu_fwd_seg_map Total fwd read segments mapped +# ssu_rev_seg_map Total rev read segments mapped +# ssu_tot_map Total read segments mapped +# ssu_tot_pair_map Total read pairs mapped (one or both segments) +# ssu_tot_pair_ratio Ratio of read (pairs) mapping to SSU (one or both segments +# ssu_tot_pair_ratio_pc Above as percentage +# assem_tot_map Total segments mapping to full length assembled +# ssu_unassem Total segments not mapping to full length +# assem_ratio Ratio assembled/unassmebled +# assem_ratio_pc Above as percentage +# spades_fwd_map Fwd read segments mapping to SPAdes +# spades_rev_map +# spades_tot_map +# emirge_fwd_map +# emirge_rev_map +# emirge_tot_map +# trusted_fwd_map +# trusted_rev_map +# trusted_tot_map # Hashes to keep count of NTUs my %taxa_ambig_byread_hash; # Hash of all taxonomy assignments per read @@ -492,6 +522,7 @@ sub parse_cmdline { 'crlf' => \$crlf, 'decimalcomma' => \$decimalcomma, 'sortmerna!' => \$use_sortmerna, + 'evalue_sortmerna=s' => \$evalue_sortmerna, 'emirge!' => \$emirge, 'skip_emirge' => \$skip_emirge, 'skip_spades' => \$skip_spades, @@ -594,7 +625,7 @@ sub parse_cmdline { if ($use_sortmerna == 1) { msg ("Sortmerna will be used instead of BBmap for the initial SSU rRNA read extraction"); msg ("Sortmerna requires uncompressed input files - Ensure that you have enough disk space"); - msg ("The following input parameters specific to BBMap will be ignored: --id, --tophit, --maxinsert"); + msg ("The following input parameters specific to BBMap will be ignored: --id, --maxinsert"); } # check surplus arguments @@ -818,8 +849,10 @@ sub write_csv { "cwd",$cwd, "minimum mapping identity",$id, "single ended mode",$SEmode, - "input reads",$readnr_pairs, + "input reads",$ssu_sam_mapstats{'total_reads'}, + "input read segments",$ssu_sam_mapstats{'total_read_segments'}, "mapped SSU reads",$SSU_total_pairs, + "mapped SSU read segments",$ssu_sam_mapstats{'ssu_tot_map'}, "mapping ratio",$SSU_ratio_pc, "detected median insert size",$ins_me, "used insert size",$ins_used, @@ -830,6 +863,8 @@ sub write_csv { "NTUs observed twice",$xtons[1], "NTUs observed three or more times",$xtons[2], "NTU Chao1 richness estimate",sprintf("%.3f",$chao1), # Round to 3 d.p. + "program command",$progcmd, + "database path",$DBHOME, )); my $fh; open_or_die(\$fh, ">", $outfiles{"report_csv"}{"filename"}); @@ -998,12 +1033,20 @@ sub sortmerna_filter_sam { "--sam", # SAM alignment output #"--blast 1", # Blast-tabular alignment output "--log", - "--best 10", # Take 10 best hits + #"--best 10", # Take 10 best hits "--min_lis 10", - "-e 1.00e-05", # E-value cutoff + "-e $evalue_sortmerna", # E-value cutoff "-a $cpus", "-v", # Verbose (turn off?) ); + if (defined $tophit_flag) { + msg ("Reporting only single best hit per read"); + push @sortmerna_args, "--best 1"; + } else { + msg ("Reporting 10 best hits per read"); + push @sortmerna_args, "--best 10"; + } + msg ("Running sortmerna:"); # Run Sortmerna run_prog("sortmerna", @@ -1068,6 +1111,7 @@ sub sortmerna_filter_sam { sub parse_mapstats_from_sam_arr { my ($aref, # Array of hrefs to SAM mapping $readnr, # Total reads (segments) input to mapper + $statshref, # Ref to hash of mapping statistics $SEmode # Flag for single-end mode ) = @_; #$readnr,$readnr_pairs,$SSU_total_pairs,$SSU_ratio,$SSU_ratio_pc @@ -1087,6 +1131,10 @@ sub parse_mapstats_from_sam_arr { # Go through each SAM record and check bitflags to see if mapping or not next if $href->{'FLAG'} & 0x100; # Skip secondary alignments my ($splitname, @discard) = split /\s/, $href->{'QNAME'}; # Split read name on whitespace and add to hash for counting + # Strip read segment suffixes from name + if ($splitname =~ m/^(.+)([:\/_])[12]/) { + $splitname = $1.$2; + } $qname_hash{$splitname} ++; if ($href->{'FLAG'} & 0x40) { # Fwd read $ssu_f_reads ++ unless $href->{'FLAG'} & 0x4; # Unless fwd read unmapped @@ -1132,16 +1180,15 @@ sub parse_mapstats_from_sam_arr { msg("=> both segments mapping to different references: $ssu_bad_pairs"); msg("Read segments where next segment unmapped: $mapped_half"); } elsif ($SEmode == 1) { - $unmapped = 1-$SSU_ratio; + $unmapped = (1-$SSU_ratio)*$readnr; } - # Ratios of mapped vs unmapped to report my @mapratio_csv; if ($SEmode == 1) { # TO DO: Add numerical values to text labels push @mapratio_csv, "Unmapped,".$unmapped; - push @mapratio_csv, "Mapped,".$SSU_ratio; + push @mapratio_csv, "Mapped,".$SSU_ratio*$readnr; } elsif ($SEmode == 0) { # For PE reads, do not include Unmapped in piechart because it will be # impossible to read the other slices anyway. Instead report mapping @@ -1152,6 +1199,16 @@ sub parse_mapstats_from_sam_arr { push @mapratio_csv, "Mapped single,".$mapped_half; } + # Update stats hash + $statshref->{'total_reads'} = $readnr_pairs; + $statshref->{'total_read_segments'} = $readnr; + $statshref->{'ssu_fwd_seg_map'} = $ssu_f_reads; + $statshref->{'ssu_rev_seg_map'} = $ssu_r_reads; + $statshref->{'ssu_tot_map'} = $ssu_f_reads + $ssu_r_reads; + $statshref->{'ssu_tot_pair_map'} = $SSU_total_pairs; + $statshref->{'ssu_tot_pair_ratio'} = $SSU_ratio; + $statshref->{'ssu_tot_pair_ratio_pc'} = $SSU_ratio_pc; + # CSV file to draw piechart my $fh_csv; open_or_die (\$fh_csv, ">", $outfiles{"mapratio_csv"}{"filename"}); @@ -1167,7 +1224,7 @@ sub parse_mapstats_from_sam_arr { $skip_assembly_flag = 1; } - my @output_array = ($readnr,$readnr_pairs,$SSU_total_pairs,$SSU_ratio,$SSU_ratio_pc); + my @output_array = ($readnr,$readnr_pairs,$SSU_total_pairs,$SSU_ratio,$SSU_ratio_pc); # TO DO: Use hash ssu_sam_mapstats to manage this instead return (\@output_array,$skip_assembly_flag); } @@ -1284,7 +1341,7 @@ sub fix_hash_bbmap_sam { } # Hash refs to each SAM alignment by QNAME, segment, and RNAME - $sam_hash{$current_read}{$current_orientation}{$line_href->{'RNAME'}} = $line_href; + push @{$sam_hash{$current_read}{$current_orientation}{$line_href->{'RNAME'}}}, $line_href; # Store array of SAM alignments in order encountered in file push @sam_arr, $line_href; @@ -1848,28 +1905,26 @@ sub screen_remappings { foreach my $pair (@pairs) { # Check if read segment exists and add to total reads if (defined $sam_fixed_href->{$read}{$pair}) { - if ($pair eq 'fwd') { - $ssu_fwd_map ++; - } elsif ($pair eq 'rev') { - $ssu_rev_map ++ ; - } - # Check whether has been flagged as mappign to SPAdes or Emirge or trusted contig + # Check whether read segment been flagged as mapping to SPAdes or Emirge or trusted contig if (defined $sam_fixed_href->{$read}{$pair}{"mapped2spades"} | defined $sam_fixed_href->{$read}{$pair}{"mapped2emirge"} | defined $sam_fixed_href->{$read}{$pair}{'mapped2trusted'}) { # Add to total of read segments mapping to a full-length seq $total_assembled ++; } else { # If not mapping to full-length seq, check if it has mapped to a SILVA ref sequence foreach my $ref (keys %{$sam_fixed_href->{$read}{$pair}}) { - if (ref($sam_fixed_href->{$read}{$pair}{$ref}) eq 'HASH' && defined $sam_fixed_href->{$read}{$pair}{$ref}->{'FLAG'}) { # If this is a hashed SAM record - unless ($sam_fixed_href->{$read}{$pair}{$ref}->{'FLAG'} & 0x4) { - push @unassem_readcounters, $read; - # If using top hit - if ($sam_fixed_href->{$read}{$pair}{$ref}->{'RNAME'} =~ m/\w+\.\d+\.\d+\s(.+)/) { - my $taxonlongstring = $1; - push @unassem_taxa, $taxonlongstring; + foreach my $href (@{$sam_fixed_href->{$read}{$pair}{$ref}}) { + if (ref($href) eq 'HASH' && defined $href->{'FLAG'}) { # If this is a hashed SAM record + unless ($href->{'FLAG'} & 0x4 || $href->{'FLAG'} & 0x100) { # Unless segment unmapped or is a secondary alignment + push @unassem_readcounters, $read; + # If using top hit + if ($href->{'RNAME'} =~ m/\w+\.\d+\.\d+\s(.+)/) { + my $taxonlongstring = $1; + push @unassem_taxa, $taxonlongstring; + } } } } + } } @@ -1884,13 +1939,12 @@ sub screen_remappings { $taxa_unassem_summary_href = consensus_taxon_counter(\%taxa_ambig_byread_hash, \@unassem_readcounters, $taxon_report_lvl); } - # Calculate ratio of reads assembled and store in hash - my $ssu_tot_map = $ssu_fwd_map + $ssu_rev_map; - my $total_unassembled = $ssu_tot_map - $total_assembled; - $ssu_sam_mapstats{"ssu_tot_map"} = $ssu_tot_map; # Total reads mapped + # Calculate ratio of reads segments assembled and store in hash + my $total_unassembled = $ssu_sam_mapstats{'ssu_tot_map'} - $total_assembled; + #$ssu_sam_mapstats{"ssu_tot_map"} = $ssu_tot_map; # Total reads mapped $ssu_sam_mapstats{"assem_tot_map"} = $total_assembled; # Total mapping to full-length $ssu_sam_mapstats{"ssu_unassem"} = $total_unassembled; # Total not mapping to a full-length - $ssu_sam_mapstats{"assem_ratio"} = $total_assembled/$ssu_tot_map; # Ratio assembled/reconstructed + $ssu_sam_mapstats{"assem_ratio"} = $total_assembled/$ssu_sam_mapstats{'ssu_tot_map'}; # Ratio assembled/reconstructed $ssu_sam_mapstats{"assem_ratio_pc"} = sprintf ("%.3f", $ssu_sam_mapstats{"assem_ratio"} * 100); # As a percentage to 3 dp # Calculate totals for each tool used @@ -2021,7 +2075,13 @@ sub emirge_run { } else { $ins_used = $ins_me; } - + if ($ins_std == 0) { + # insert size std dev has to be nonzero or Emirge will refuse to run + # SortMeRNA doesn't report insert size, so we make a guess + $ins_std = $ins_used / 2; + msg ("Warning: No insert size reported by mapper. Using initial guess $ins_std"); + } + msg("the insert size used is $ins_used +- $ins_std"); # FIXME: EMIRGE dies with too many SSU reads, the cutoff needs to be adjusted... if ($SSU_total_pairs <= $amplimit) { @@ -2439,7 +2499,7 @@ sub run_plotscript_SVG { } # Piechart of proportion mapped - msg("Plotting piechart of mappign ratios"); + msg("Plotting piechart of mapping ratios"); my @map_args = ("-pie", $outfiles{"mapratio_csv"}{"filename"}, #"-title=\"$SSU_ratio_pc % reads mapped\"" @@ -2916,7 +2976,7 @@ sub write_logfile { parse_stats_taxonomy_from_sam_array($sam_fixed_aref); # Get mapping statistics from hashed SAM records -my ($mapping_stats_aref, $skipflag) = parse_mapstats_from_sam_arr($sam_fixed_aref,$readnr,$SEmode); +my ($mapping_stats_aref, $skipflag) = parse_mapstats_from_sam_arr($sam_fixed_aref,$readnr,\%ssu_sam_mapstats,$SEmode); # Dereference stats ($readnr,$readnr_pairs,$SSU_total_pairs,$SSU_ratio,$SSU_ratio_pc) = @$mapping_stats_aref; if (defined $skipflag && $skipflag == 1) { diff --git a/phyloFlash_compare.pl b/phyloFlash_compare.pl index afbb1b5..a3895de 100755 --- a/phyloFlash_compare.pl +++ b/phyloFlash_compare.pl @@ -865,7 +865,9 @@ sub calc_weight_tax_unifrac_pair { # The equivalent of the scaling factor D would be the total taxonomic rank depth # of the tax tree, e.g. 7 for species, 4 for order. The branch length between # each rank is implicitly = 1 in the calc_weight_tax_unifrac_pair() procedure - return $raw_unifrac / $taxlevel; + # The number D in Lozupone et al. 2007 reduces to 2d because d_j is the same + # for all j in a taxonomic tree, and sum_j^n (A_j/A_T) is unity. + return $raw_unifrac / (2 * $taxlevel); } diff --git a/phyloFlash_makedb.pl b/phyloFlash_makedb.pl index 59d5a07..f763264 100755 --- a/phyloFlash_makedb.pl +++ b/phyloFlash_makedb.pl @@ -15,7 +15,13 @@ =head1 SYNOPSIS ### use local copies -phyloFlash makedb.pl --silva_file F --univec_file F +phyloFlash_makedb.pl --silva_file F --univec_file F + +## Get help + +phyloFlash_makedb.pl --help + +phyloFlash_makedb.pl --man =head1 DESCRIPTION @@ -109,6 +115,12 @@ =head2 CONFIGURATION AND HELP Default: 10 +=item --log I + +Write phyloFlash_makedb.pl log to a file. + +Default: None + =item --check_env Check that required dependencies are available in your path. If specifying @@ -175,6 +187,7 @@ =head1 COPYRIGHT AND LICENSE my $sortmerna = 0; # Index sortmerna tool my $emirge = 1; # Index emirge my $memlimitGb = 10; # In Gb +my $makedb_log; # commandline arguments my $use_remote = 0; # Download SILVA and Univec databases via FTP @@ -194,6 +207,7 @@ =head1 COPYRIGHT AND LICENSE exit; }, "keep|k" => \$keep, "overwrite!" => \$overwrite, + "log=s" => \$makedb_log, "help|h" => sub{pod2usage(-verbose=>1);}, "man|m" => sub {pod2usage(-verbose=>2);}, "version" => sub { welcome(); @@ -262,6 +276,7 @@ =head1 COPYRIGHT AND LICENSE my @lsu_in_ssh; if (! -e "$dbdir/SILVA_SSU.noLSU.fasta" || $overwrite == 1) { @lsu_in_ssh = find_LSU("$dbdir/SILVA_SSU.noLSU.fasta"); + msg ("Removing sequences with potential LSU contamination"); fasta_copy_except("$dbdir/SILVA_SSU.fasta", "$dbdir/SILVA_SSU.noLSU.fasta", @lsu_in_ssh, @@ -335,6 +350,9 @@ =head1 COPYRIGHT AND LICENSE finish(); +write_logfile($makedb_log) if defined $makedb_log; + + ## SUBS ######################################################################## sub welcome { @@ -399,6 +417,7 @@ sub find_LSU { sub make_vsearch_udb { my ($infile, $outfile, $overwrite) = @_; msg ("Indexing $infile to make UDB file $outfile with Vsearch"); + my $log = "tmp.vsearch_make_udb.log"; if (! -e $outfile || $overwrite == 1) { my @vsearch_params = ("--threads $cpus", '--notrunclabels', # Keep full header including taxstring @@ -406,7 +425,9 @@ sub make_vsearch_udb { "--output $outfile", ); run_prog("vsearch", - join (" ", @vsearch_params) + join (" ", @vsearch_params), + undef, + $log ); } else { msg ("WARNING: File $outfile already exists. Not overwriting"); @@ -417,6 +438,7 @@ sub make_vsearch_udb { sub mask_repeats { my ($src, $dst, $overwrite) = @_; msg("masking low entropy regions in SSU RefNR"); + my $log = "tmp.bbmask_mask_repeats.log"; if (! -e $dst || $overwrite == 1) { run_prog("bbmask", " overwrite=t " @@ -425,7 +447,9 @@ sub mask_repeats { . "in=$src " . "out=$dst " . "minkr=4 maxkr=8 mr=t minlen=20 minke=4 maxke=8 " - . "fastawrap=0 "); + . "fastawrap=0 ", + undef, + $log); } else { msg ("File $dst exists, not overwriting"); } @@ -435,6 +459,7 @@ sub mask_repeats { sub univec_trim { my ($univec, $src, $dst,$overwrite) = @_; msg("removing UniVec contamination in SSU RefNR"); + my $log = "tmp.bbduk_remove_univec.log"; if (! -e $dst || $overwrite == 1 ) { run_prog("bbduk", "ref=$univec " @@ -445,30 +470,27 @@ sub univec_trim { . "ktrim=r ow=t minlength=800 mink=11 hdist=1 " . "in=$src " . "out=$dst " - . "stats=$dst.UniVec_contamination_stats.txt"); + . "stats=$dst.UniVec_contamination_stats.txt", + undef, + $log); } else { msg ("File $dst exists, not overwriting"); } } -sub iuppac_replace { - my ($src, $dst) = @_; - msg("replacing IUPAC coded ambiguous bases with randomly chosen bases"); - run_prog("fixchars", - "<$src", - $dst); -} - #the NR99 file is fixed and a reference file is generated for bbmap sub bbmap_db { my ($ref, $path, $overwrite) = @_; msg("creating bbmap reference"); + my $log = "tmp.bbmap_index.log"; if (!-d "$path/ref" || $overwrite == 1) { run_prog("bbmap", " -Xmx".$memlimitGb."g " # Original 4 Gb limit was not enough . "threads=$cpus " . "ref=$ref " - . "path=$path "); + . "path=$path ", + undef, + $log); } else { msg ("WARNING: Folder $path exists, not overwriting"); } @@ -478,8 +500,12 @@ sub bbmap_db { sub bowtie_index { my ($fasta) = @_; my $btidx = $fasta =~ s/\.fasta$/\.bt/r; + my $log = "tmp.bowtiebuild.log"; msg("creating bowtie index (for emirge)"); - run_prog("bowtiebuild", "$fasta $btidx -q"); + run_prog("bowtiebuild", + "$fasta $btidx -q", + undef, + $log); } # Generate index for Sortmerna from the filtered fixed SSU clustered at 96% id @@ -487,6 +513,7 @@ sub sortmerna_index { my ($fasta,$overwrite) = @_; my $memlimitMb = $memlimitGb * 1000; my $prefix = $fasta =~ s/\.fasta$//r; + my $log = "tmp.indexdb_rna.log"; msg ("creating sortmerna index"); if (! -e "$prefix.bursttrie_0.dat" || $overwrite == 1) { my @indexdb_args = ("--ref $fasta,$prefix", @@ -494,6 +521,8 @@ sub sortmerna_index { "-v"); run_prog("indexdb_rna", join (" ", @indexdb_args), + undef, + $log ) } else { msg ("WARNING: SortMeRNA indices for file prefix $prefix already exist, not overwriting"); @@ -534,3 +563,12 @@ sub finish { the databases with -dbhome: /path/to/your/databases/ or change script line XXX accordingly");#Fixme } + +sub write_logfile { + my $file = shift; + msg ("Saving log to file $file"); + my $fh; + open_or_die(\$fh, ">>", $file); + print $fh join "\n", @PhyloFlash::msg_log; + close($fh); +} \ No newline at end of file