Merge pull request #3 from tkchafin/dev
Dev
tkchafin authored Nov 11, 2024
2 parents 254fea1 + 8d2b83f commit 2afc63d
Showing 14 changed files with 73 additions and 48 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,16 @@ The pipeline is now considered to be a complete and suitable replacement for the
"grid plots".
- Fill in accurate read information in the blobDir. Users are now required
  to indicate in the samplesheet whether the reads are paired or single.
- Updated the Blastn settings to allow at most 7 days of runtime, since that
  covers 99.7% of the jobs.

### Software dependencies

Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported.

| Dependency | Old version | New version |
| ----------- | ----------- | ----------- |
| blobtoolkit | 4.3.9 | 4.3.13 |

## [[0.6.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.6.0)] – Bellsprout – [2024-09-13]

13 changes: 7 additions & 6 deletions conf/base.config
@@ -106,14 +106,15 @@ process {

withName: "BLAST_BLASTN" {

-// There are blast failures we don't know how to fix. Just ignore for now
-errorStrategy = { task.exitStatus in ((130..145) + 104) ? (task.attempt == process.maxRetries ? 'ignore' : 'retry') : 'finish' }
+// There are blast failures we don't know how to fix. We just give up after 3 attempts
+errorStrategy = { task.exitStatus in ((130..145) + 104) ? (task.attempt == 3 ? 'ignore' : 'retry') : 'finish' }


// Most jobs complete quickly but some need a lot longer. For those outliers,
-// the CPU usage remains usually low, often nearing a single CPU
-cpus = { check_max( 6 - (task.attempt-1), 'cpus' ) }
-memory = { check_max( 1.GB * Math.pow(4, task.attempt-1), 'memory' ) }
-time = { check_max( 10.h * Math.pow(4, task.attempt-1), 'time' ) }
+// the CPU usage usually remains low, averaging a single CPU
+cpus = { check_max( task.attempt == 1 ? 4 : 1, 'cpus' ) }
+memory = { check_max( 2.GB, 'memory' ) }
+time = { check_max( task.attempt == 1 ? 4.h : ( task.attempt == 2 ? 47.h : 167.h ), 'time' ) }
}

withName:CUSTOM_DUMPSOFTWAREVERSIONS {
2 changes: 1 addition & 1 deletion conf/test.config
@@ -25,7 +25,7 @@ params {
input = "${projectDir}/assets/test/samplesheet_s3.csv"

// Fasta references
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.phiXspike.fasta.gz"
accession = "GCA_922984935.2"
taxon = "Meles meles"

76 changes: 45 additions & 31 deletions docs/usage.md
@@ -54,7 +54,20 @@ An [example samplesheet](assets/test/samplesheet.csv) has been provided with the
The pipeline can also accept a samplesheet generated by the [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline (tested with version 1.11.0).
The pipeline then needs the `--fetchngs_samplesheet true` option _and_ `--align true`, since the data files would all be unaligned.
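A minimal sketch of such a run is shown below — the profile, file names, and accession are placeholders for illustration, not values taken from the pipeline docs:

```bash
nextflow run sanger-tol/blobtoolkit \
  -profile docker \
  --input fetchngs_samplesheet.csv \
  --fetchngs_samplesheet true \
  --align true \
  --fasta genome.fasta.gz \
  --accession GCA_XXXXXXXXX.X \
  --taxon "Meles meles" \
  --outdir results
```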

-## Getting databases ready for the pipeline
+## Database parameters

Configure access to your local databases with the `--busco`, `--blastp`, `--blastx`, `--blastn`, and `--taxdump` parameters.

Note that `--busco` points to the directory where _all_ lineages have been downloaded.
When explicitly selecting the lineages to run the pipeline on,
provide their names _with_ the `_odb10` suffix as a comma-separated string.
For instance:

```bash
--busco path-to-databases/busco/ --busco_lineages vertebrata_odb10,bacteria_odb10,fungi_odb10
```
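Putting it all together, a full invocation might look like the sketch below. The paths follow the database layout built in the next section; whether `--blastp`/`--blastx` expect the `.dmnd` file itself or its parent directory should be checked against the pipeline's parameter documentation.

```bash
nextflow run sanger-tol/blobtoolkit \
  -profile singularity \
  --input samplesheet.csv \
  --fasta genome.fasta.gz \
  --accession GCA_922984935.2 \
  --taxon "Meles meles" \
  --taxdump /path/to/databases/taxdump_2024_10 \
  --blastn /path/to/databases/nt_2024_10 \
  --blastp /path/to/databases/uniprot_2024_10/reference_proteomes.dmnd \
  --blastx /path/to/databases/uniprot_2024_10/reference_proteomes.dmnd \
  --busco /path/to/databases/busco_2024_10 \
  --busco_lineages vertebrata_odb10,bacteria_odb10,fungi_odb10 \
  --outdir results
```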

### Getting databases ready for the pipeline

The BlobToolKit pipeline can be run in many different ways. The default way requires access to several databases:

@@ -65,35 +78,31 @@

It is a good idea to add a date suffix to each database location so you know at a glance whether you are using the latest version. We are using the `YYYY_MM` format as we do not expect the databases to be updated more frequently than once a month. However, feel free to use `DATE=YYYY_MM_DD` or a different format if you prefer.
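If you prefer to derive the suffix automatically, something like this works with GNU `date`:

```bash
DATE=$(date +%Y_%m)      # e.g. 2024_10
# or, for a day-level suffix:
DATE=$(date +%Y_%m_%d)   # e.g. 2024_10_28
```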

-### 1. NCBI taxdump database
+#### 1. NCBI taxdump database

-Create the database directory and move into the directory:
+Create the database directory, then retrieve and decompress the NCBI taxonomy:

```bash
-DATE=2023_03
+DATE=2024_10
TAXDUMP=/path/to/databases/taxdump_${DATE}
-mkdir -p $TAXDUMP
-cd $TAXDUMP
-```
-
-Retrieve and decompress the NCBI taxdump:
-
-```bash
-curl -L ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar xzf -
+mkdir -p "$TAXDUMP"
+curl -L ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -xzf - -C "$TAXDUMP"
```
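As a quick sanity check — `nodes.dmp` and `names.dmp` are the files the `diamond makedb` step below relies on — you can confirm the archive unpacked correctly:

```bash
ls "$TAXDUMP"/nodes.dmp "$TAXDUMP"/names.dmp
```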

-### 2. NCBI nucleotide BLAST database
+#### 2. NCBI nucleotide BLAST database

Create the database directory and move into the directory:

```bash
-DATE=2023_03
+DATE=2024_10
NT=/path/to/databases/nt_${DATE}
mkdir -p $NT
cd $NT
```

-Retrieve the NCBI blast nt database (version 5) files and tar gunzip them. We are using the `&&` syntax to ensure that each command completes without error before the next one is run:
+Retrieve the NCBI blast nt database (version 5) files and extract them.
+`wget` and the use of the FTP protocol are necessary to resolve the wildcard `nt.???.tar.gz`.
+We are using the `&&` syntax to ensure that each command completes without error before the next one is run:

```bash
wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ &&
@@ -106,46 +115,51 @@
tar xf taxdb.tar.gz -C $NT &&
rm taxdb.tar.gz
```
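If you have BLAST+ available, a quick way to confirm the database is intact before pointing the pipeline at it (this check is our suggestion, not part of the official instructions):

```bash
blastdbcmd -db "$NT/nt" -info
```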

-### 3. UniProt reference proteomes database
+#### 3. UniProt reference proteomes database

-You need [diamond blast](https://github.com/bbuchfink/diamond) installed for this step. The easiest way is probably using [conda](https://anaconda.org/bioconda/diamond). Make sure you have the latest version of Diamond (>2.x.x) otherwise the `--taxonnames` argument may not work.
+You need [diamond blast](https://github.com/bbuchfink/diamond) installed for this step.
+The easiest way is probably to install their [pre-compiled release](https://github.com/bbuchfink/diamond/releases).
+Make sure you have a recent version of Diamond (2.x.x or later), otherwise the `--taxonnames` argument may not work.
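For example, on Linux x86-64 the release tarball ships a single static binary — the asset name below is an assumption, so check the releases page for your platform:

```bash
curl -L https://github.com/bbuchfink/diamond/releases/latest/download/diamond-linux64.tar.gz | tar xzf -
./diamond version   # should report 2.x.x or later
```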

Create the database directory and move into the directory:

```bash
-DATE=2023_03
+DATE=2024_10
UNIPROT=/path/to/databases/uniprot_${DATE}
mkdir -p $UNIPROT
cd $UNIPROT
```

-The UniProt `Refseq_Proteomes_YYYY_MM.tar.gz` file is very large (>160 GB) and will take a long time to download. The command below looks complex because it needs to get around the problem of using wildcards with wget and curl.
+The UniProt `Refseq_Proteomes_YYYY_MM.tar.gz` file is very large (close to 200 GB) and will take a long time to download.
+The command below looks complex because it needs to get around the problem of using wildcards with wget and curl.

```bash
-wget -q -O $UNIPROT/reference_proteomes.tar.gz \
-ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/$(curl \
--vs ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/ 2>&1 | \
-awk '/tar.gz/ {print $9}')
-tar xf reference_proteomes.tar.gz
+EBI_URL=ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/
+mkdir extract
+curl -L $EBI_URL/$(curl -vs $EBI_URL 2>&1 | awk '/tar.gz/ {print $9}') | \
+tar -xzf - -C extract

# Create a single fasta file with all the fasta files from each subdirectory:
-touch reference_proteomes.fasta.gz
-find . -mindepth 2 | grep "fasta.gz" | grep -v 'DNA' | grep -v 'additional' | xargs cat >> reference_proteomes.fasta.gz
+find extract -type f -name '*.fasta.gz' ! -name '*_DNA.fasta.gz' ! -name '*_additional.fasta.gz' -exec cat '{}' '+' > reference_proteomes.fasta.gz

# create the accession-to-taxid map for all reference proteome sequences:
printf "accession\taccession.version\ttaxid\tgi\n" > reference_proteomes.taxid_map
zcat */*/*.idmapping.gz | grep "NCBI_TaxID" | awk '{print $1 "\t" $1 "\t" $3 "\t" 0}' >> reference_proteomes.taxid_map
find extract -type f -name '*.idmapping.gz' -exec zcat {} + | \
awk 'BEGIN {OFS="\t"; print "accession", "accession.version", "taxid", "gi"} $2=="NCBI_TaxID" {print $1, $1, $3, 0}' > reference_proteomes.taxid_map

# create the taxon aware diamond blast database
diamond makedb -p 16 --in reference_proteomes.fasta.gz --taxonmap reference_proteomes.taxid_map --taxonnodes $TAXDUMP/nodes.dmp --taxonnames $TAXDUMP/names.dmp -d reference_proteomes.dmnd

# clean up
mv extract/{README,STATS} .
rm -r extract
```
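To check the finished database, `diamond dbinfo` prints basic statistics such as the sequence count (assuming a diamond ≥ 2 binary on your PATH):

```bash
diamond dbinfo -d reference_proteomes.dmnd
```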

-### 4. BUSCO databases
+#### 4. BUSCO databases

Create the database directory and move into the directory:

```bash
-DATE=2023_03
+DATE=2024_10
BUSCO=/path/to/databases/busco_${DATE}
mkdir -p $BUSCO
cd $BUSCO
@@ -232,7 +246,7 @@ List of tools for any given dataset can be fetched from the API

| Dependency | Snakemake | Nextflow |
| ----------------- | --------- | -------- |
-| blobtoolkit | 4.3.2 | 4.3.9 |
+| blobtoolkit | 4.3.2 | 4.3.13 |
| blast | 2.12.0 | 2.14.1 |
| blobtk | 0.5.0 | 0.5.1 |
| busco | 5.3.2 | 5.5.0 |
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/chunk.nf
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CHUNK {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta) , path(fasta)
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/countbuscos.nf
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_COUNTBUSCOS {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_COUNTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), path(table, stageAs: 'dir??/*')
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/createblobdir.nf
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), path(window, stageAs: 'windowstats/*')
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/extractbuscos.nf
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_EXTRACTBUSCOS {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_EXTRACTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), path(fasta)
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/summary.nf
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_SUMMARY {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_SUMMARY module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), path(blobdir)
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/unchunk.nf
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UNCHUNK {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_UNCHUNK module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), path(blast_table)
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/updateblobdir.nf
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEBLOBDIR {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), path(input, stageAs: "input_blobdir")
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/updatemeta.nf
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEMETA {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), path(input)
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/windowstats.nf
Expand Up @@ -4,7 +4,7 @@ process BLOBTOOLKIT_WINDOWSTATS {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_WINDOWSTATS module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), path(tsv)
2 changes: 1 addition & 1 deletion modules/local/generate_config.nf
Expand Up @@ -3,7 +3,7 @@ process GENERATE_CONFIG {
label 'process_single'

conda "conda-forge::requests=2.28.1 conda-forge::pyyaml=6.0"
container "docker.io/genomehubs/blobtoolkit:4.3.9"
container "docker.io/genomehubs/blobtoolkit:4.3.13"

input:
tuple val(meta), val(fasta)
