diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b4d60e7..0d2a81c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ General tidy up of the configuration and the pipeline - Increased the resources for blastn - Removed some options that were not used or not needed - All relevant outputs are now copied to the output directory +- Fixed some blast parameters to match the behaviour of the Snakemake pipeline ### Parameters diff --git a/conf/base.config b/conf/base.config index 65058d5e..75fa0d06 100644 --- a/conf/base.config +++ b/conf/base.config @@ -105,9 +105,15 @@ process { } withName: "BLAST_BLASTN" { - cpus = { check_max( 24 * task.attempt, 'cpus' ) } - memory = { check_max( 100.MB * task.attempt, 'memory' ) } - time = { check_max( 12.h * task.attempt, 'time' ) } + + // There are blast failures we don't know how to fix. Just ignore for now + errorStrategy = { task.exitStatus in ((130..145) + 104) ? (task.attempt == process.maxRetries ? 'ignore' : 'retry') : 'finish' } + + // Most jobs complete quickly but some need a lot longer. 
For those outliers, + // the CPU usage usually remains low, often nearing a single CPU + cpus = { check_max( 6 - (task.attempt-1), 'cpus' ) } + memory = { check_max( 1.GB * Math.pow(4, task.attempt-1), 'memory' ) } + time = { check_max( 10.h * Math.pow(4, task.attempt-1), 'time' ) } } withName:CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/conf/modules.config b/conf/modules.config index 3e54b96a..ac597dc4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -138,7 +138,7 @@ process { } withName: "BLAST_BLASTN" { - ext.args = "-outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'" + ext.args = "-task megablast -outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'" } withName: "CUSTOM_DUMPSOFTWAREVERSIONS" { diff --git a/modules.json b/modules.json index d80a794d..ebb45a6c 100644 --- a/modules.json +++ b/modules.json @@ -30,12 +30,14 @@ "diamond/blastp": { "branch": "master", "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/diamond/blastp/diamond-blastp.diff" }, "diamond/blastx": { "branch": "master", "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/diamond/blastx/diamond-blastx.diff" }, "fastawindows": { "branch": "master", diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff index 449f7240..e01e07cb 100644 --- a/modules/nf-core/blast/blastn/blast-blastn.diff +++ b/modules/nf-core/blast/blastn/blast-blastn.diff @@ -16,21 +16,31 @@ Changes in module 'nf-core/blast/blastn' output: tuple val(meta), path('*.txt'), emit: txt -@@ -23,6 +23,7 @@ +@@ -23,6 +23,8 @@ def prefix = task.ext.prefix ?: "${meta.id}" def is_compressed = fasta.getExtension() == "gz" ? 
true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta + def exclude_taxon = taxid ? "-negative_taxids ${taxid}" : '' ++ def command_epilog = taxid ? "|| true" : '' """ if [ "${is_compressed}" == "true" ]; then -@@ -39,6 +40,7 @@ +@@ -39,8 +41,15 @@ -num_threads ${task.cpus} \\ -db \$DB \\ -query ${fasta_name} \\ + ${exclude_taxon} \\ ${args} \\ - -out ${prefix}.txt +- -out ${prefix}.txt ++ -out ${prefix}.txt \\ ++ 2> >( tee "${prefix}.error.log" >&2 ) $command_epilog ++ ++ if [[ -s "${prefix}.error.log" ]] ++ then ++ grep -qF 'BLAST Database error: Taxonomy ID(s) not found.Taxonomy ID(s) not found' "${prefix}.error.log" ++ fi + cat <<-END_VERSIONS > versions.yml + "${task.process}": ************************************************************ diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index 368e7bcc..d674989a 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -24,6 +24,7 @@ process BLAST_BLASTN { def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta def exclude_taxon = taxid ? "-negative_taxids ${taxid}" : '' + def command_epilog = taxid ? 
"|| true" : '' """ if [ "${is_compressed}" == "true" ]; then @@ -42,7 +43,13 @@ process BLAST_BLASTN { -query ${fasta_name} \\ ${exclude_taxon} \\ ${args} \\ - -out ${prefix}.txt + -out ${prefix}.txt \\ + 2> >( tee "${prefix}.error.log" >&2 ) $command_epilog + + if [[ -s "${prefix}.error.log" ]] + then + grep -qF 'BLAST Database error: Taxonomy ID(s) not found.Taxonomy ID(s) not found' "${prefix}.error.log" + fi cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/diamond/blastp/diamond-blastp.diff b/modules/nf-core/diamond/blastp/diamond-blastp.diff new file mode 100644 index 00000000..12608ea0 --- /dev/null +++ b/modules/nf-core/diamond/blastp/diamond-blastp.diff @@ -0,0 +1,29 @@ +Changes in module 'nf-core/diamond/blastp' +--- modules/nf-core/diamond/blastp/main.nf ++++ modules/nf-core/diamond/blastp/main.nf +@@ -12,6 +12,7 @@ + tuple val(meta2), path(db) + val out_ext + val blast_columns ++ val taxid + + output: + tuple val(meta), path('*.blast'), optional: true, emit: blast +@@ -32,6 +33,7 @@ + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? fasta.getBaseName() : fasta + def columns = blast_columns ? "${blast_columns}" : '' ++ def exclude_taxon = taxid ? 
"--taxon-exclude ${taxid}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break +@@ -59,6 +61,7 @@ + --db \$DB \\ + --query ${fasta_name} \\ + --outfmt ${outfmt} ${columns} \\ ++ ${exclude_taxon} \\ + ${args} \\ + --out ${prefix}.${out_ext} + + +************************************************************ diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf index dc01cdcc..ae5a1248 100644 --- a/modules/nf-core/diamond/blastp/main.nf +++ b/modules/nf-core/diamond/blastp/main.nf @@ -12,6 +12,7 @@ process DIAMOND_BLASTP { tuple val(meta2), path(db) val out_ext val blast_columns + val taxid output: tuple val(meta), path('*.blast'), optional: true, emit: blast @@ -32,6 +33,7 @@ process DIAMOND_BLASTP { def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta def columns = blast_columns ? "${blast_columns}" : '' + def exclude_taxon = taxid ? "--taxon-exclude ${taxid}" : '' switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break @@ -59,6 +61,7 @@ process DIAMOND_BLASTP { --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ + ${exclude_taxon} \\ ${args} \\ --out ${prefix}.${out_ext} diff --git a/modules/nf-core/diamond/blastx/diamond-blastx.diff b/modules/nf-core/diamond/blastx/diamond-blastx.diff new file mode 100644 index 00000000..eff4326a --- /dev/null +++ b/modules/nf-core/diamond/blastx/diamond-blastx.diff @@ -0,0 +1,29 @@ +Changes in module 'nf-core/diamond/blastx' +--- modules/nf-core/diamond/blastx/main.nf ++++ modules/nf-core/diamond/blastx/main.nf +@@ -12,6 +12,7 @@ + tuple val(meta2), path(db) + val out_ext + val blast_columns ++ val taxid + + output: + tuple val(meta), path('*.blast'), optional: true, emit: blast +@@ -33,6 +34,7 @@ + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? 
fasta.getBaseName() : fasta + def columns = blast_columns ? "${blast_columns}" : '' ++ def exclude_taxon = taxid ? "--taxon-exclude ${taxid}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break +@@ -60,6 +62,7 @@ + --db \$DB \\ + --query ${fasta_name} \\ + --outfmt ${outfmt} ${columns} \\ ++ ${exclude_taxon} \\ + ${args} \\ + --out ${prefix}.${out_ext} \\ + --log + +************************************************************ diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf index bf3f623c..dfa82e24 100644 --- a/modules/nf-core/diamond/blastx/main.nf +++ b/modules/nf-core/diamond/blastx/main.nf @@ -12,6 +12,7 @@ process DIAMOND_BLASTX { tuple val(meta2), path(db) val out_ext val blast_columns + val taxid output: tuple val(meta), path('*.blast'), optional: true, emit: blast @@ -33,6 +34,7 @@ process DIAMOND_BLASTX { def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta def columns = blast_columns ? "${blast_columns}" : '' + def exclude_taxon = taxid ? 
"--taxon-exclude ${taxid}" : '' switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break @@ -60,6 +62,7 @@ process DIAMOND_BLASTX { --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ + ${exclude_taxon} \\ ${args} \\ --out ${prefix}.${out_ext} \\ --log diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 59c65a24..2a89471f 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -39,6 +39,7 @@ workflow BUSCO_DIAMOND { | transpose() | filter { rank,id -> rank =~ /species/ } | map { rank, id -> id} + | first | set { ch_taxid } @@ -116,7 +117,7 @@ workflow BUSCO_DIAMOND { // Hardcoded to match the format expected by blobtools def outext = 'txt' def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' - DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols ) + DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols, ch_taxid ) ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf index ed2df41f..715e5ae2 100644 --- a/subworkflows/local/run_blastx.nf +++ b/subworkflows/local/run_blastx.nf @@ -11,6 +11,7 @@ workflow RUN_BLASTX { fasta // channel: [ val(meta), path(fasta) ] table // channel: [ val(meta), path(busco_table) ] blastx // channel: [ val(meta), path(blastx_db) ] + taxon_id // channel: val(taxon_id) main: @@ -30,7 +31,7 @@ workflow RUN_BLASTX { // Hardocded to match the format expected by blobtools def outext = 'txt' def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' - DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols) + DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols, taxon_id ) ch_versions = ch_versions.mix ( DIAMOND_BLASTX.out.versions.first() ) 
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 5c1d946c..3610cdde 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -140,6 +140,7 @@ workflow BLOBTOOLKIT { PREPARE_GENOME.out.genome, BUSCO_DIAMOND.out.first_table, ch_blastx, + BUSCO_DIAMOND.out.taxon_id, ) ch_versions = ch_versions.mix ( RUN_BLASTX.out.versions ) @@ -151,7 +152,7 @@ workflow BLOBTOOLKIT { RUN_BLASTX.out.blastx_out, PREPARE_GENOME.out.genome, ch_blastn, - BUSCO_DIAMOND.out.taxon_id + BUSCO_DIAMOND.out.taxon_id, ) //