From 8a05dc2bf3152c9b924b8730c1a9ee7acb26fb5a Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Wed, 2 Oct 2024 13:17:03 +0100
Subject: [PATCH 1/3] Documentation update

---
 docs/usage.md | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 22df5508..1f17571a 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -54,7 +54,20 @@ An [example samplesheet](assets/test/samplesheet.csv) has been provided with the
 The pipeline can also accept a samplesheet generated by the [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline (tested with version 1.11.0).
 The pipeline then needs the `--fetchngs_samplesheet true` option _and_ `--align true`, since the data files would all be unaligned.

-## Getting databases ready for the pipeline
+## Database parameters
+
+Configure access to your local databases with the `--busco`, `--blastp`, `--blastx`, `--blastn`, and `--taxdump` parameters.
+
+Note that `--busco` refers to the download path of _all_ lineages.
+Then, when explicitly selecting the lineages to run the pipeline on,
+provide the names of these lineages _with_ their `_odb10` suffix as a comma-separated string.
+For instance:
+
+```bash
+--busco path-to-databases/busco/ --busco_lineages vertebrata_odb10,bacteria_odb10,fungi_odb10
+```
+
+### Getting databases ready for the pipeline

 The BlobToolKit pipeline can be run in many different ways. The default way requires access to several databases:

@@ -65,7 +78,7 @@ The BlobToolKit pipeline can be run in many different ways. The default way requ
 It is a good idea to put a date suffix for each database location so you know at a glance whether you are using the latest version. We are using the `YYYY_MM` format as we do not expect the databases to be updated more frequently than once a month. However, feel free to use `DATE=YYYY_MM_DD` or a different format if you prefer.

-### 1. NCBI taxdump database
+#### 1. NCBI taxdump database

 Create the database directory and move into the directory:

@@ -82,7 +95,7 @@ Retrieve and decompress the NCBI taxdump:
 curl -L ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar xzf -
 ```

-### 2. NCBI nucleotide BLAST database
+#### 2. NCBI nucleotide BLAST database

 Create the database directory and move into the directory:

@@ -106,7 +119,7 @@ tar xf taxdb.tar.gz -C $NT && rm taxdb.tar.gz
 ```

-### 3. UniProt reference proteomes database
+#### 3. UniProt reference proteomes database

 You need [diamond blast](https://github.com/bbuchfink/diamond) installed for this step. The easiest way is probably using [conda](https://anaconda.org/bioconda/diamond). Make sure you have the latest version of Diamond (>2.x.x) otherwise the `--taxonnames` argument may not work.

@@ -140,7 +153,7 @@ zcat */*/*.idmapping.gz | grep "NCBI_TaxID" | awk '{print $1 "\t" $1 "\t" $3 "\t
 diamond makedb -p 16 --in reference_proteomes.fasta.gz --taxonmap reference_proteomes.taxid_map --taxonnodes $TAXDUMP/nodes.dmp --taxonnames $TAXDUMP/names.dmp -d reference_proteomes.dmnd
 ```

-### 4. BUSCO databases
+#### 4. BUSCO databases

 Create the database directory and move into the directory:
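The database parameters introduced in the patch above can also be grouped in a small Nextflow configuration file rather than spelled out on the command line. The sketch below is illustrative only: the parameter names come from the documentation change itself, but the exact values each one expects (directory versus file, the date-suffixed locations, the `reference_proteomes.dmnd` file built in step 3) are assumptions based on the database setup steps, not something the patch specifies.

```groovy
// db_params.config -- hypothetical example; paths and file layouts are assumptions.
params {
    taxdump        = "/path/to/databases/taxdump_2023_03"
    blastn         = "/path/to/databases/nt_2023_03"
    blastp         = "/path/to/databases/uniprot_2023_03/reference_proteomes.dmnd"
    blastx         = "/path/to/databases/uniprot_2023_03/reference_proteomes.dmnd"
    busco          = "/path/to/databases/busco_2023_03"   // download path of all lineages
    busco_lineages = "vertebrata_odb10,bacteria_odb10,fungi_odb10"
}
```

Such a file would be supplied alongside the usual inputs, e.g. `nextflow run sanger-tol/blobtoolkit -profile docker -c db_params.config --input samplesheet.csv ...`; the profile name and `--input` are assumptions here, only the database parameters are taken from the patch.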
From abe2b763e88a68f3ac8edfd854c1f2fd0003dea9 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Mon, 14 Oct 2024 15:04:30 +0100
Subject: [PATCH 2/3] Bumped up the underlying BTK version because 4.3.9 had a bug

---
 CHANGELOG.md                               | 8 ++++++++
 docs/usage.md                              | 2 +-
 modules/local/blobtoolkit/chunk.nf         | 2 +-
 modules/local/blobtoolkit/countbuscos.nf   | 2 +-
 modules/local/blobtoolkit/createblobdir.nf | 2 +-
 modules/local/blobtoolkit/extractbuscos.nf | 2 +-
 modules/local/blobtoolkit/summary.nf       | 2 +-
 modules/local/blobtoolkit/unchunk.nf       | 2 +-
 modules/local/blobtoolkit/updateblobdir.nf | 2 +-
 modules/local/blobtoolkit/updatemeta.nf    | 2 +-
 modules/local/blobtoolkit/windowstats.nf   | 2 +-
 modules/local/generate_config.nf           | 2 +-
 12 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 23549f6f..ba8fb37d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,14 @@ The pipeline is now considered to be a complete and suitable replacement for the
 - Fill in accurate read information in the blobDir. Users are now reqiured to
   indicate in the samplesheet whether the reads are paired or single.

+### Software dependencies
+
+Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported.
+
+| Dependency  | Old version | New version |
+| ----------- | ----------- | ----------- |
+| blobtoolkit | 4.3.9       | 4.3.13      |
+
 ## [[0.6.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.6.0)] – Bellsprout – [2024-09-13]

 The pipeline has now been validated for draft (unpublished) assemblies.

diff --git a/docs/usage.md b/docs/usage.md
index 1f17571a..48db5565 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -245,7 +245,7 @@ List of tools for any given dataset can be fetched from the API, for example htt

 | Dependency        | Snakemake | Nextflow |
 | ----------------- | --------- | -------- |
-| blobtoolkit       | 4.3.2     | 4.3.9    |
+| blobtoolkit       | 4.3.2     | 4.3.13   |
 | blast             | 2.12.0    | 2.14.1   |
 | blobtk            | 0.5.0     | 0.5.1    |
 | busco             | 5.3.2     | 5.5.0    |

diff --git a/modules/local/blobtoolkit/chunk.nf b/modules/local/blobtoolkit/chunk.nf
index 7dad9182..0b8a7989 100644
--- a/modules/local/blobtoolkit/chunk.nf
+++ b/modules/local/blobtoolkit/chunk.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_CHUNK {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta) , path(fasta)

diff --git a/modules/local/blobtoolkit/countbuscos.nf b/modules/local/blobtoolkit/countbuscos.nf
index 1b415504..0e525ede 100644
--- a/modules/local/blobtoolkit/countbuscos.nf
+++ b/modules/local/blobtoolkit/countbuscos.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_COUNTBUSCOS {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_COUNTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), path(table, stageAs: 'dir??/*')
diff --git a/modules/local/blobtoolkit/createblobdir.nf b/modules/local/blobtoolkit/createblobdir.nf
index dfaddb7d..b68e32a1 100644
--- a/modules/local/blobtoolkit/createblobdir.nf
+++ b/modules/local/blobtoolkit/createblobdir.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), path(window, stageAs: 'windowstats/*')

diff --git a/modules/local/blobtoolkit/extractbuscos.nf b/modules/local/blobtoolkit/extractbuscos.nf
index 1e4440cb..bd598539 100644
--- a/modules/local/blobtoolkit/extractbuscos.nf
+++ b/modules/local/blobtoolkit/extractbuscos.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_EXTRACTBUSCOS {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_EXTRACTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), path(fasta)

diff --git a/modules/local/blobtoolkit/summary.nf b/modules/local/blobtoolkit/summary.nf
index 9b1a262f..539fe9c8 100644
--- a/modules/local/blobtoolkit/summary.nf
+++ b/modules/local/blobtoolkit/summary.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_SUMMARY {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_SUMMARY module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), path(blobdir)

diff --git a/modules/local/blobtoolkit/unchunk.nf b/modules/local/blobtoolkit/unchunk.nf
index 5285b0dc..87fa1bb5 100644
--- a/modules/local/blobtoolkit/unchunk.nf
+++ b/modules/local/blobtoolkit/unchunk.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_UNCHUNK {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_UNCHUNK module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), path(blast_table)

diff --git a/modules/local/blobtoolkit/updateblobdir.nf b/modules/local/blobtoolkit/updateblobdir.nf
index d736a03f..b3940642 100644
--- a/modules/local/blobtoolkit/updateblobdir.nf
+++ b/modules/local/blobtoolkit/updateblobdir.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEBLOBDIR {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), path(input, stageAs: "input_blobdir")

diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf
index a5556348..356e6699 100644
--- a/modules/local/blobtoolkit/updatemeta.nf
+++ b/modules/local/blobtoolkit/updatemeta.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEMETA {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), path(input)

diff --git a/modules/local/blobtoolkit/windowstats.nf b/modules/local/blobtoolkit/windowstats.nf
index d432a8ff..7ca59db4 100644
--- a/modules/local/blobtoolkit/windowstats.nf
+++ b/modules/local/blobtoolkit/windowstats.nf
@@ -4,7 +4,7 @@ process BLOBTOOLKIT_WINDOWSTATS {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_WINDOWSTATS module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), path(tsv)

diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf
index a6cac943..2992ab6d 100644
--- a/modules/local/generate_config.nf
+++ b/modules/local/generate_config.nf
@@ -3,7 +3,7 @@ process GENERATE_CONFIG {
     label 'process_single'

     conda "conda-forge::requests=2.28.1 conda-forge::pyyaml=6.0"
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+    container "docker.io/genomehubs/blobtoolkit:4.3.13"

     input:
     tuple val(meta), val(fasta)
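Every module touched by the patch above pins the same `docker.io/genomehubs/blobtoolkit` image, which is why the version bump is a one-line change per file. For local testing of a different BlobToolKit build, the pin can also be overridden centrally instead of editing each module; a minimal sketch using the standard Nextflow `withName` selector (the process names come from the modules above, the image tag is only an example):

```groovy
// btk_override.config -- hypothetical file passed with `-c btk_override.config`.
// Matches the BLOBTOOLKIT_* local modules plus GENERATE_CONFIG, which share the image.
process {
    withName: 'BLOBTOOLKIT_.*|GENERATE_CONFIG' {
        container = 'docker.io/genomehubs/blobtoolkit:4.3.13'  // swap in the tag under test
    }
}
```

Note that, as stated in the changelog entry above, only Docker or Singularity containers are supported for these modules, so any override still has to point at a container image rather than a conda environment.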
From dfb4655aafd07022504e4b3a48b4242edb58a8d7 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Mon, 14 Oct 2024 15:14:25 +0100
Subject: [PATCH 3/3] Updated the blastn runtime requirements to avoid basement jobs

Indeed, 85.9% of jobs complete within 4 hours, 98% within 47 hours, and 99.7% within 167 hours
---
 CHANGELOG.md     |  2 ++
 conf/base.config | 13 +++++++------
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba8fb37d..2b07486f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ The pipeline is now considered to be a complete and suitable replacement for the
   "grid plots".
 - Fill in accurate read information in the blobDir. Users are now reqiured to
   indicate in the samplesheet whether the reads are paired or single.
+- Updated the Blastn settings to allow 7 days runtime at most, since that
+  covers 99.7% of the jobs.

 ### Software dependencies
diff --git a/conf/base.config b/conf/base.config
index f24b6b99..25648852 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -106,14 +106,15 @@ process {

     withName: "BLAST_BLASTN" {

-        // There are blast failures we don't know how to fix. Just ignore for now
-        errorStrategy = { task.exitStatus in ((130..145) + 104) ? (task.attempt == process.maxRetries ? 'ignore' : 'retry') : 'finish' }
+        // There are blast failures we don't know how to fix. We just give up after 3 attempts
+        errorStrategy = { task.exitStatus in ((130..145) + 104) ? (task.attempt == 3 ? 'ignore' : 'retry') : 'finish' }
+        // Most jobs complete quickly but some need a lot longer. For those outliers,
-        // the CPU usage remains usually low, often nearing a single CPU
-        cpus   = { check_max( 6 - (task.attempt-1), 'cpus' ) }
-        memory = { check_max( 1.GB * Math.pow(4, task.attempt-1), 'memory' ) }
-        time   = { check_max( 10.h * Math.pow(4, task.attempt-1), 'time' ) }
+        // the CPU usage remains usually low, averaging a single CPU
+        cpus   = { check_max( task.attempt == 1 ? 4 : 1, 'cpus' ) }
+        memory = { check_max( 2.GB, 'memory' ) }
+        time   = { check_max( task.attempt == 1 ? 4.h : ( task.attempt == 2 ? 47.h : 167.h ), 'time' ) }
     }

     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
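Read together with the commit message, the new directives map the observed runtimes onto retry tiers: attempt 1 gets 4 CPUs and 4 hours (85.9% of jobs), attempt 2 drops to a single CPU for 47 hours (98%), and attempt 3 gets 167 hours, just under the 7-day ceiling mentioned in the changelog (99.7%), after which the known-bad exit codes are ignored rather than retried again; memory stays at a flat 2 GB. A site whose scheduler refuses week-long jobs could cap the later tiers from its own configuration; a minimal sketch, assuming the usual `-c custom.config` mechanism (the 72-hour value is illustrative, not part of the patch):

```groovy
// site_limits.config -- hypothetical override for clusters without a week-long queue.
process {
    withName: 'BLAST_BLASTN' {
        // Keep the quick 4 h first attempt, but cap every retry at 72 h.
        time = { task.attempt == 1 ? 4.h : 72.h }
    }
}
```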