From df3daa3939a9d1cd110fa3314a81664eacd27299 Mon Sep 17 00:00:00 2001 From: Kwee Boon Brandon Seah Date: Wed, 28 Apr 2021 13:12:36 +0200 Subject: [PATCH 1/3] Skip barrnap if no SPAdes assembly Addresses issue #142 Also show more informative error message if SPAdes or Emirge fail --- phyloFlash.pl | 200 ++++++++++++++++++++++++++------------------------ 1 file changed, 105 insertions(+), 95 deletions(-) diff --git a/phyloFlash.pl b/phyloFlash.pl index 642dc00..4a912fb 100755 --- a/phyloFlash.pl +++ b/phyloFlash.pl @@ -1600,107 +1600,111 @@ sub spades_parse { #further processing if (! -s "$libraryNAME.spades/scaffolds.fasta") { msg("no phylotypes assembled with SPAdes"); + msg("possible causes include low or uneven coverage of SSU sequences"); $skip_spades = 1; remove_tree("$libraryNAME.spades"); + return 1; } - # run barrnap once for each domain - # if single cell data - accept partial rRNAs down to 0.1 - # and use lower e-value setting - my $b_args = " --evalue 1e-100 --reject 0.6 " ; - if ($sc == 1) { - $b_args = " --evalue 1e-20 --reject 0.1 "; - } - - foreach ('bac', 'arch', 'euk') { - run_prog("barrnap", - $b_args. - "--kingdom $_ --gene ssu --threads $cpus " . - "$libraryNAME.spades/scaffolds.fasta", - $outfiles{"gff_".$_}{"filename"}, - $outfiles{"barrnap_log"}{"filename"}); - $outfiles{"gff_".$_}{"made"}++; - $outfiles{"barrnap_log"}{"made"}++; - } - - # now merge multi-hits on the same scaffold-and-strand by picking - # the lowest start and highest stop position. - - my %ssus; - # pre-filter with grep for speed - my $fh; - open_or_die(\$fh, "-|", - "grep -hE '16S_rRNA\|18S_rRNA' ". - $outfiles{"gff_bac"}{"filename"}." ". - $outfiles{"gff_arch"}{"filename"}." ". - $outfiles{"gff_euk"}{"filename"}); - while (my $row = <$fh>) { - my @cols = split("\t", $row); - # gff format: - # 0 seqname, 1 source, 2 feature, 3 start, 4 end, - # 5 score, 6 strand, 7 frame, 8 attribute - - my $seqname = $cols[0]; - my $start = $cols[3]; - my $stop = $cols[4]; - my $strand = $cols[6]; - - # put our desired output fasta name into "feature" col 3 - # the may be "bug" using / mixing bed and gff formats - # but it saves us messing with the fasta afterwards - $seqname =~ m/NODE_([0-9]*)_.*cov_([0-9\\.]*)/; - $cols[2] = "$libraryNAME.PFspades_$1_$2"; - - # do the actual merging, left most start and right most stop wins - if (exists $ssus{$seqname.$strand}) { - my $old_start = $ssus{$seqname.$strand}[3]; - my $old_stop = $ssus{$seqname.$strand}[4]; - $cols[3] = ($start < $old_start) ? $start : $old_start; - $cols[4] = ($stop > $old_stop) ? $stop : $old_stop; - } - $ssus{$seqname.$strand} = [@cols]; - } - close($fh); - - if (scalar keys %ssus == 0) { - msg("no contig in spades assembly found to contain rRNA"); - return 1; - } else { - open_or_die(\$fh, ">", $outfiles{"gff_all"}{"filename"}); - for my $key (sort keys %ssus) { - print $fh join("\t",@{$ssus{$key}}); + else { + # run barrnap once for each domain + # if single cell data - accept partial rRNAs down to 0.1 + # and use lower e-value setting + my $b_args = " --evalue 1e-100 --reject 0.6 " ; + if ($sc == 1) { + $b_args = " --evalue 1e-20 --reject 0.1 "; + } + + foreach ('bac', 'arch', 'euk') { + run_prog("barrnap", + $b_args. + "--kingdom $_ --gene ssu --threads $cpus " . + "$libraryNAME.spades/scaffolds.fasta", + $outfiles{"gff_".$_}{"filename"}, + $outfiles{"barrnap_log"}{"filename"}); + $outfiles{"gff_".$_}{"made"}++; + $outfiles{"barrnap_log"}{"made"}++; + } + + # now merge multi-hits on the same scaffold-and-strand by picking + # the lowest start and highest stop position. + + my %ssus; + # pre-filter with grep for speed + my $fh; + open_or_die(\$fh, "-|", + "grep -hE '16S_rRNA\|18S_rRNA' ". + $outfiles{"gff_bac"}{"filename"}." ". + $outfiles{"gff_arch"}{"filename"}." ". + $outfiles{"gff_euk"}{"filename"}); + while (my $row = <$fh>) { + my @cols = split("\t", $row); + # gff format: + # 0 seqname, 1 source, 2 feature, 3 start, 4 end, + # 5 score, 6 strand, 7 frame, 8 attribute + + my $seqname = $cols[0]; + my $start = $cols[3]; + my $stop = $cols[4]; + my $strand = $cols[6]; + + # put our desired output fasta name into "feature" col 3 + # the may be "bug" using / mixing bed and gff formats + # but it saves us messing with the fasta afterwards + $seqname =~ m/NODE_([0-9]*)_.*cov_([0-9\\.]*)/; + $cols[2] = "$libraryNAME.PFspades_$1_$2"; + + # do the actual merging, left most start and right most stop wins + if (exists $ssus{$seqname.$strand}) { + my $old_start = $ssus{$seqname.$strand}[3]; + my $old_stop = $ssus{$seqname.$strand}[4]; + $cols[3] = ($start < $old_start) ? $start : $old_start; + $cols[4] = ($stop > $old_stop) ? $stop : $old_stop; + } + $ssus{$seqname.$strand} = [@cols]; } close($fh); - $outfiles{"gff_all"}{"made"}++; - # fastaFromBed will build a .fai index from the source .fasta - # However, it does not notice if the .fasta changed. So we - # delete the .fai if it is older than the .fasta. - if ( -e "$libraryNAME.spades/scaffolds.fasta.fai" && - file_is_newer("$libraryNAME.spades/scaffolds.fasta", - "$libraryNAME.spades/scaffolds.fasta.fai")) { - unlink("$libraryNAME.spades/scaffolds.fasta.fai"); - } + if (scalar keys %ssus == 0) { + msg("no contig in spades assembly found to contain rRNA"); + return 1; + } else { + open_or_die(\$fh, ">", $outfiles{"gff_all"}{"filename"}); + for my $key (sort keys %ssus) { + print $fh join("\t",@{$ssus{$key}}); + } + close($fh); + $outfiles{"gff_all"}{"made"}++; + + # fastaFromBed will build a .fai index from the source .fasta + # However, it does not notice if the .fasta changed. So we + # delete the .fai if it is older than the .fasta. + if ( -e "$libraryNAME.spades/scaffolds.fasta.fai" && + file_is_newer("$libraryNAME.spades/scaffolds.fasta", + "$libraryNAME.spades/scaffolds.fasta.fai")) { + unlink("$libraryNAME.spades/scaffolds.fasta.fai"); + } - # extract rrna fragments from spades scaffolds accoding to gff - my @fastaFromBed_args = ("-fi $libraryNAME.spades/scaffolds.fasta", - "-bed",$outfiles{"gff_all"}{"filename"}, - "-fo",$outfiles{"spades_fastaFromBed"}{"filename"}, - "-s","-name", + # extract rrna fragments from spades scaffolds accoding to gff + my @fastaFromBed_args = ("-fi $libraryNAME.spades/scaffolds.fasta", + "-bed",$outfiles{"gff_all"}{"filename"}, + "-fo",$outfiles{"spades_fastaFromBed"}{"filename"}, + "-s","-name", + ); + run_prog("fastaFromBed", + join (" ",@fastaFromBed_args), + $outfiles{"fastaFromBed_out"}{"filename"}, + "&1"); + # Fix bedtools headers from bedtools v2.26 and v2.27+ + fix_bedtools_header ($outfiles{"spades_fastaFromBed"}{"filename"}, + $outfiles{"spades_fasta"}{"filename"} ); - run_prog("fastaFromBed", - join (" ",@fastaFromBed_args), - $outfiles{"fastaFromBed_out"}{"filename"}, - "&1"); - # Fix bedtools headers from bedtools v2.26 and v2.27+ - fix_bedtools_header ($outfiles{"spades_fastaFromBed"}{"filename"}, - $outfiles{"spades_fasta"}{"filename"} - ); - $outfiles{"spades_fastaFromBed"}{"made"}++; - $outfiles{"spades_fasta"}{"made"}++; - $outfiles{"fastaFromBed_out"}{"made"}++; - msg("done..."); - return 0; + $outfiles{"spades_fastaFromBed"}{"made"}++; + $outfiles{"spades_fasta"}{"made"}++; + $outfiles{"fastaFromBed_out"}{"made"}++; + msg("done..."); + return 0; + } } } @@ -3051,7 +3055,7 @@ sub write_report_html { if ($readnr == 0) { # Exit if no reads mapped - err ("ERROR: No reads were detected! Possible reasons: Coverage in library too low; option -readlimit too low; input is not a (meta)genome/transcriptome dataset."); + err("ERROR: No reads were detected! Possible reasons: Coverage in library too low; option -readlimit too low; input is not a (meta)genome/transcriptome dataset."); } # Get taxonomic summary from hashed SAM records @@ -3085,14 +3089,20 @@ sub write_report_html { if ($spades_parse_return == 0) { bbmap_remap("SPAdes"); } + } else { + msg("SPAdes exited with error, this may happen if no sequences were assembled Possible causes include coverage per sequence too low or uneven"); } } # Run Emirge if not explicitly skipped if ($skip_emirge == 0) { my $emirge_return = emirge_run(); # Parse output from emirge if it exits without errors - emirge_parse() if $emirge_return == 0; - bbmap_remap("EMIRGE") if $emirge_return == 0; + if ($emirge_return == 0) { + emirge_parse(); + bbmap_remap("EMIRGE"); + } else { + msg("EMIRGE exited with error, this may happen if no sequences were reconstructed Possible causes include coverage per sequence too low or uneven"); + } } # If full length sequenced produced, match vs SILVA database with Vsearch and align with best hits if (defined $outfiles{"spades_fasta"}{"made"} || defined $outfiles{"emirge_fasta"}{"made"} || defined $outfiles{'trusted_fasta'}{'made'}) { From 5a7a3eaef32fcc96b9edb0e68272be68eef73dda Mon Sep 17 00:00:00 2001 From: Kwee Boon Brandon Seah Date: Wed, 28 Apr 2021 13:18:24 +0200 Subject: [PATCH 2/3] Bump version number --- PhyloFlash.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PhyloFlash.pm b/PhyloFlash.pm index a961db6..6c3aee5 100644 --- a/PhyloFlash.pm +++ b/PhyloFlash.pm @@ -28,7 +28,7 @@ This module contains helper functions shared by the phyloFlash scripts. =cut -our $VERSION = "3.4"; +our $VERSION = "3.4.1"; our @ISA = qw(Exporter); our @EXPORT = qw( $VERSION From d78eed570e183289e73a9b89672194eb4d73db50 Mon Sep 17 00:00:00 2001 From: Kwee Boon Brandon Seah Date: Fri, 8 Oct 2021 16:04:29 +0200 Subject: [PATCH 3/3] Recommend Mamba env solver in docs --- README.md | 11 ++++++++++- docs/index.md | 8 ++++++++ docs/install.md | 8 ++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3240a5e..acf6aff 100644 --- a/README.md +++ b/README.md @@ -27,13 +27,20 @@ Read [our paper](https://doi.org/10.1128/mSystems.00920-20) on phyloFlash. [Conda](https://conda.io/docs/) is a package manager that will also install dependencies that are required if you don't have them already. -phyloFlash is distributed through the [Bioconda](http://bioconda.github.io/) channel on Conda. +phyloFlash is distributed through the [Bioconda](http://bioconda.github.io/) +channel on Conda. According to the [Conda documentation](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html), it is recommended to install all packages at the same time to avoid dependency conflicts, and to create new environments instead of installing to the base environment. +We also recommend using [Mamba](https://mamba.readthedocs.io/en/latest/) as a +drop-in substitute for Conda. It implements a more effective dependency solver +and is also the default Conda frontend for the pipeline managers Snakemake. +Conda sometimes fails to solve the environment, and in these cases Mamba +usually works. + ```bash # If you haven't set up Bioconda already conda config --add channels defaults @@ -44,6 +51,8 @@ conda config --set channel_priority strict # Create new environment named "pf" with phyloflash # Sortmerna is an optional dependency conda create -n pf phyloflash sortmerna=2.1b +# If Conda is unable to solve the environment; requires mamba in base env +mamba create -n pf phyloflash sortmerna=2.1b ``` ### Download from GitHub diff --git a/docs/index.md b/docs/index.md index bbe9048..0bb997a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -44,6 +44,12 @@ it is recommended to install all packages at the same time to avoid dependency conflicts, and to create new environments instead of installing to the base environment. +We also recommend using [Mamba](https://mamba.readthedocs.io/en/latest/) as a +drop-in substitute for Conda. It implements a more effective dependency solver +and is also the default Conda frontend for the pipeline managers Snakemake. +Conda sometimes fails to solve the environment, and in these cases Mamba +usually works. + ```bash # If you haven't set up Bioconda already conda config --add channels defaults @@ -54,6 +60,8 @@ conda config --set channel_priority strict # Create new environment named "pf" with phyloflash # sortmerna is an optional dependency conda create -n pf phyloflash sortmerna=2.1b +# If Conda is unable to solve the environment; requires mamba in base env +mamba create -n pf phyloflash sortmerna=2.1b ``` ### Download from GitHub diff --git a/docs/install.md b/docs/install.md index 2c99dd9..4dc7e92 100644 --- a/docs/install.md +++ b/docs/install.md @@ -36,6 +36,12 @@ it is recommended to install all packages at the same time to avoid dependency conflicts, and to create new environments instead of installing to the base environment. +We also recommend using [Mamba](https://mamba.readthedocs.io/en/latest/) as a +drop-in substitute for Conda. It implements a more effective dependency solver +and is also the default Conda frontend for the pipeline managers Snakemake. +Conda sometimes fails to solve the environment, and in these cases Mamba +usually works. + ```bash # If you haven't set up Bioconda already conda config --add channels defaults @@ -46,6 +52,8 @@ conda config --set channel_priority strict # Create new environment named "pf" with phyloflash # sortmerna is an optional dependency conda create -n pf phyloflash sortmerna=2.1b +# If Conda is unable to solve the environment; requires mamba in base env +mamba create -n pf phyloflash sortmerna=2.1b ``` In some cases, `conda install` can hang on the "Solving environment" step. This