diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 589d7118..14a733bc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,19 +35,9 @@ jobs: with: version: "${{ matrix.NXF_VER }}" - - name: Download the NCBI taxdump database - run: | - mkdir ncbi_taxdump - curl -L https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -C ncbi_taxdump -xzf - - - - name: Download the BUSCO lineage database - run: | - mkdir busco_database - curl -L https://tolit.cog.sanger.ac.uk/test-data/resources/busco/blobtoolkit.GCA_922984935.2.2023-08-03.lineages.tar.gz | tar -C busco_database -xzf - - - name: Run pipeline with test data # You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --taxdump $PWD/ncbi_taxdump --busco $PWD/busco_database --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b07486f..c5d7c7b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.7.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.7.0)] – Psyduck – [2024-10-02] +## [[0.7.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.7.0)] – Psyduck – [2024-11-20] The pipeline is now considered to be a complete and suitable replacement for the Snakemake version. @@ -13,6 +13,7 @@ The pipeline is now considered to be a complete and suitable replacement for the to indicate in the samplesheet whether the reads are paired or single. - Updated the Blastn settings to allow 7 days runtime at most, since that covers 99.7% of the jobs. 
+- Allow database inputs to be optionally compressed (`.tar.gz`) ### Software dependencies diff --git a/assets/test/mMelMel3.1.buscogenes.dmnd b/assets/test/mMelMel3.1.buscogenes.dmnd deleted file mode 100644 index 391345ba..00000000 Binary files a/assets/test/mMelMel3.1.buscogenes.dmnd and /dev/null differ diff --git a/assets/test/mMelMel3.1.buscoregions.dmnd b/assets/test/mMelMel3.1.buscoregions.dmnd deleted file mode 100644 index 91fa6042..00000000 Binary files a/assets/test/mMelMel3.1.buscoregions.dmnd and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb deleted file mode 100644 index 18062436..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr deleted file mode 100644 index 0b5d4906..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin deleted file mode 100644 index bebd568b..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog deleted file mode 100644 index e6ef79c7..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos deleted file mode 100644 index 99700566..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not deleted file mode 100644 index 047e8d38..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq 
b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq deleted file mode 100644 index 48497573..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf deleted file mode 100644 index 3be5ea5b..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto deleted file mode 100644 index 6d4a41c7..00000000 Binary files a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto and /dev/null differ diff --git a/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 b/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 deleted file mode 100644 index dc933c1f..00000000 Binary files a/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 and /dev/null differ diff --git a/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd b/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd deleted file mode 100644 index a0d0e1d2..00000000 Binary files a/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd and /dev/null differ diff --git a/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd b/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd deleted file mode 100644 index 3f2a1a54..00000000 Binary files a/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb deleted file mode 100644 index 0905629a..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr deleted file mode 100644 index 1fa3521a..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin 
b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin deleted file mode 100644 index 0503c4c7..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog deleted file mode 100644 index 7dcd60eb..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos deleted file mode 100644 index 6bd1dcdf..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not deleted file mode 100644 index 8bacddec..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq deleted file mode 100644 index 6afe38e9..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf deleted file mode 100644 index efd34086..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto deleted file mode 100644 index 4b140ec3..00000000 Binary files a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto and /dev/null differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 b/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 deleted file mode 100644 index 2a56a82f..00000000 Binary files 
a/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 and /dev/null differ diff --git a/conf/test.config b/conf/test.config index 1801bc09..20331442 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,11 +30,11 @@ params { taxon = "Meles meles" // Databases - taxdump = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump" - busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03" - blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" - blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" - blastn = "${projectDir}/assets/test/nt_mMelMel3.1" + taxdump = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" + busco = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/blobtoolkit.GCA_922984935.2.2023-08-03.tar.gz" + blastp = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/mMelMel3.1.buscogenes.dmnd.tar.gz" + blastx = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/mMelMel3.1.buscoregions.dmnd.tar.gz" + blastn = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/nt_mMelMel3.1.tar.gz" // Need to be set to avoid overfilling /tmp use_work_dir_as_temp = true diff --git a/conf/test_full.config b/conf/test_full.config index ca78130e..a86e0050 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -25,11 +25,11 @@ params { taxon = "Laetiporus sulphureus" // Databases - taxdump = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump" + taxdump = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" busco = "/lustre/scratch123/tol/resources/busco/latest" - blastp = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" - blastx = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd" - blastn = "${projectDir}/assets/test_full/nt_gfLaeSulp1.1" + blastp = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/gfLaeSulp1.1.buscogenes.dmnd.tar.gz" 
+ blastx = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/gfLaeSulp1.1.buscoregions.dmnd.tar.gz" + blastn = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/nt_gfLaeSulp1.1.tar.gz" // Need to be set to avoid overfilling /tmp use_work_dir_as_temp = true diff --git a/conf/test_raw.config b/conf/test_raw.config index 0cf1d16f..7af9bd2e 100644 --- a/conf/test_raw.config +++ b/conf/test_raw.config @@ -31,11 +31,11 @@ params { taxon = "Meles meles" // Databases - taxdump = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump" - busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03" - blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" - blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" - blastn = "${projectDir}/assets/test/nt_mMelMel3.1/" + taxdump = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" + busco = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/blobtoolkit.GCA_922984935.2.2023-08-03.tar.gz" + blastp = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/mMelMel3.1.buscogenes.dmnd.tar.gz" + blastx = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/mMelMel3.1.buscoregions.dmnd.tar.gz" + blastn = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/nt_mMelMel3.1.tar.gz" // Need to be set to avoid overfilling /tmp use_work_dir_as_temp = true diff --git a/docs/usage.md b/docs/usage.md index d2ed32b1..6f8909bf 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -78,6 +78,9 @@ The BlobToolKit pipeline can be run in many different ways. The default way requ It is a good idea to put a date suffix for each database location so you know at a glance whether you are using the latest version. We are using the `YYYY_MM` format as we do not expect the databases to be updated more frequently than once a month. 
However, feel free to use `DATE=YYYY_MM_DD` or a different format if you prefer. +Note that all input databases may be optionally passed directly to the pipeline compressed as `.tar.gz`, and the pipeline will handle decompression. +The instructions below show how to build each input database in _two_ forms: decompressed _and_ compressed. You may not need to do both. Select the one that is most appropriate for how you want to use the pipeline. + #### 1. NCBI taxdump database Create the database directory, retrieve and decompress the NCBI taxonomy: @@ -85,8 +88,10 @@ Create the database directory, retrieve and decompress the NCBI taxonomy: ```bash DATE=2024_10 TAXDUMP=/path/to/databases/taxdump_${DATE} +TAXDUMP_TAR=/path/to/databases/taxdump_${DATE}.tar.gz mkdir -p "$TAXDUMP" -curl -L ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -xzf - -C "$TAXDUMP" +curl -L https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz -o $TAXDUMP_TAR +tar -xzf $TAXDUMP_TAR -C "$TAXDUMP" ``` #### 2. NCBI nucleotide BLAST database @@ -96,6 +101,7 @@ Create the database directory and move into the directory: ```bash DATE=2024_10 NT=/path/to/databases/nt_${DATE} +NT_TAR=/path/to/databases/nt_${DATE}.tar.gz mkdir -p $NT cd $NT ``` @@ -113,6 +119,11 @@ done wget "https://ftp.ncbi.nlm.nih.gov/blast/db/v5/taxdb.tar.gz" && tar xf taxdb.tar.gz -C $NT && rm taxdb.tar.gz + +# Compress and cleanup +cd .. +tar -cvzf $NT_TAR $(basename $NT) +rm -r $NT ``` #### 3. UniProt reference proteomes database @@ -126,6 +137,7 @@ Create the database directory and move into the directory: ```bash DATE=2024_10 UNIPROT=/path/to/databases/uniprot_${DATE} +UNIPROT_TAR=/path/to/databases/uniprot_${DATE}.tar.gz mkdir -p $UNIPROT cd $UNIPROT ``` @@ -152,6 +164,12 @@ diamond makedb -p 16 --in reference_proteomes.fasta.gz --taxonmap reference_prot # clean up mv extract/{README,STATS} . rm -r extract +rm -r $TAXDUMP + +# Compress final database and cleanup +cd .. 
+tar -cvzf $UNIPROT_TAR $(basename $UNIPROT) +rm -r $UNIPROT ``` #### 4. BUSCO databases @@ -161,6 +179,7 @@ Create the database directory and move into the directory: ```bash DATE=2024_10 BUSCO=/path/to/databases/busco_${DATE} +BUSCO_TAR=/path/to/databases/busco_${DATE}.tar.gz mkdir -p $BUSCO cd $BUSCO ``` @@ -181,6 +200,13 @@ If you have [GNU parallel](https://www.gnu.org/software/parallel/) installed, yo find v5/data -name "*.tar.gz" | parallel "cd {//}; tar -xzf {/}" ``` +Finally re-compress and cleanup the files: + +```bash +tar -cvzf $BUSCO_TAR -C "$(dirname $BUSCO)" "$(basename $BUSCO)" +cd .. && rm -r $BUSCO +``` + ## Changes from Snakemake to Nextflow ### Commands diff --git a/modules.json b/modules.json index 4af0bcd6..23b5b5d2 100644 --- a/modules.json +++ b/modules.json @@ -87,6 +87,11 @@ "installed_by": ["modules"], "patch": "modules/nf-core/seqtk/subseq/seqtk-subseq.diff" }, + "untar": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "windowmasker/mkcounts": { "branch": "master", "git_sha": "32cac29d4a92220965dace68a1fb0bb2e3547cac", diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index 2992ab6d..0d121fca 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -10,13 +10,11 @@ process GENERATE_CONFIG { val taxon_query val busco_lin path lineage_tax_ids - tuple val(meta2), path(blastn) val reads - // The following are passed as "val" because we just want to know the full paths. 
No staging necessary - val blastp_path - val blastx_path - val blastn_path - val taxdump_path + tuple val(meta2), path(blastp) + tuple val(meta3), path(blastx) + tuple val(meta4), path(blastn) + tuple val(meta5), path(taxdump) output: tuple val(meta), path("*.yaml") , emit: yaml @@ -43,10 +41,10 @@ process GENERATE_CONFIG { $accession_params \\ --nt $blastn \\ $input_reads \\ - --blastp ${blastp_path} \\ - --blastx ${blastx_path} \\ - --blastn ${blastn_path} \\ - --taxdump ${taxdump_path} \\ + --blastp ${blastp} \\ + --blastx ${blastx} \\ + --blastn ${blastn} \\ + --taxdump ${taxdump} \\ --output_prefix ${prefix} cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 00000000..c7794856 --- /dev/null +++ b/modules/nf-core/untar/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::grep=3.11 + - conda-forge::sed=4.8 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 00000000..9bd8f554 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,84 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:22.04' : + 'nf-core/ubuntu:22.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir ${prefix} + ## Dry-run untaring the archive to get the files and place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + for i in `tar -tf ${archive}`; + do + if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]]; + then + touch \${i} + else + mkdir -p \${i} + fi + done + else + for i in `tar -tf ${archive}`; + do + if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]]; + then + touch ${prefix}/\${i} + else + mkdir -p ${prefix}/\${i} + fi + done + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 00000000..290346b3 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,49 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. 
+ documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - untar: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - $prefix: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 00000000..c957517a --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,85 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + tag "modules" + tag "modules_nfcore" + tag "untar" + + test("test_untar") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + } + + test("test_untar_onlyfiles") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + } + + test("test_untar - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 
'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + } + + test("test_untar_onlyfiles - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + } +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 00000000..ceb91b79 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,158 @@ +{ + "test_untar_onlyfiles": { + "content": [ + { + "0": [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ], + "1": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ], + "untar": [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ], + "versions": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-10T12:04:28.231047" + }, + "test_untar_onlyfiles - stub": { + "content": [ + { + "0": [ + [ + [ + + ], + [ + "hello.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ], + "untar": [ + [ + [ + + ], + [ + "hello.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-10T12:04:45.773103" + }, + "test_untar - stub": { + "content": [ + { + "0": [ + [ + [ + + ], + [ + "hash.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "opts.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "taxo.k2d:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] 
+ ], + "1": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ], + "untar": [ + [ + [ + + ], + [ + "hash.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "opts.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "taxo.k2d:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-10T12:04:36.777441" + }, + "test_untar": { + "content": [ + { + "0": [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ], + "1": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ], + "untar": [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ], + "versions": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-10T12:04:19.377674" + } +} \ No newline at end of file diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml new file mode 100644 index 00000000..feb6f15c --- /dev/null +++ b/modules/nf-core/untar/tests/tags.yml @@ -0,0 +1,2 @@ +untar: + - modules/nf-core/untar/** diff --git a/nextflow_schema.json b/nextflow_schema.json index e722369d..3c75ab58 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,7 +52,8 @@ "type": "string", "enum": ["png", "svg"], "description": "Select the format of the output images.", - "fa_icon": "fas fa-image" + "fa_icon": "fas fa-image", + "default": "png" }, "outdir": { "type": "string", @@ -109,40 +110,40 @@ "properties": { "busco": { "type": "string", - "format": "directory-path", + "format": "path", "description": "Local directory where clade-specific BUSCO lineage datasets are 
stored", "fa_icon": "fas fa-folder-open" }, "lineage_tax_ids": { "type": "string", - "format": "file-path", + "format": "path", "description": "Local file that holds a mapping between BUSCO lineages and taxon IDs.", "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.*.2019-12-16.txt.tar.gz", "fa_icon": "fas fa-file-code" }, "blastp": { "type": "string", - "format": "file-path", - "pattern": "^\\S+\\.dmnd$", + "format": "path", + "pattern": "^\\S+\\.dmnd.*$", "description": "Path to the Diamond species-specific buscogenes database", "fa_icon": "fas fa-file-archive" }, "blastx": { "type": "string", - "format": "file-path", - "pattern": "^\\S+\\.dmnd$", + "format": "path", + "pattern": "^\\S+\\.dmnd.*$", "description": "Path to the Diamond species-specific buscoregions database", "fa_icon": "fas fa-file-archive" }, "blastn": { "type": "string", - "format": "directory-path", + "format": "path", "description": "Path to the nucleotide BLAST database", "fa_icon": "fas fa-file-archive" }, "taxdump": { "type": "string", - "format": "directory-path", + "format": "path", "description": "Path to the new NCBI tax dump database", "fa_icon": "fas fa-folder-open" } diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 4b07723e..2e1a442d 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -47,12 +47,11 @@ workflow BUSCO_DIAMOND { ch_fasta_with_lineage, "genome", ch_fasta_with_lineage.map { it[0].lineage_name }, - busco_db, + busco_db.first(), [], ) ch_versions = ch_versions.mix ( BUSCO.out.versions.first() ) - // // Tidy up the BUSCO output directories before publication // diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 69a4757a..223c01b7 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,6 +2,7 @@ // Check input samplesheet 
and get aligned read channels // +include { UNTAR } from '../../modules/nf-core/untar/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' @@ -15,11 +16,56 @@ workflow INPUT_CHECK { taxon // channel: val(taxon) busco_lin // channel: val([busco_lin]) lineage_tax_ids // channel: /path/to/lineage_tax_ids - blastn // channel: [ val(meta), path(blastn_db) ] + blastn // channel: [ path(blastn_db) ] + blastp // channel: [ path(blastp_db) ] + blastx // channel: [ path(blastx_db) ] + busco_db // channel: [ path(busco_db) ] + taxdump // channel: [ path(taxdump) ] main: ch_versions = Channel.empty() + // + // SUBWORKFLOW: Decompress databases if needed + // + + // Join into single databases channel + databases = blastn.concat(blastp, blastx, busco_db, taxdump) + + // Check which need to be decompressed + ch_dbs_for_untar = databases + .branch { db_meta, db_path -> + untar: db_path.name.endsWith( ".tar.gz" ) + skip: true + } + + // Untar the databases + UNTAR ( ch_dbs_for_untar.untar ) + ch_versions = ch_versions.mix( UNTAR.out.versions.first() ) + + // Join and format dbs + // NOTE: The conditional for blastp/x is needed because nf-core/untar puts the database in a directory + ch_databases = UNTAR.out.untar.concat( ch_dbs_for_untar.skip ) + .map { meta, db -> [ meta + [id: db.baseName], db] } + .map { db_meta, db_path -> + if (db_meta.type in ["blastp", "blastx"]) { + [db_meta, file(db_path.toString() + "/${db_path.name}", checkIfExists: true)] + } else { + [db_meta, db_path] + } + } + .branch { db_meta, db_path -> + blastn: db_meta.type == "blastn" + blastp: db_meta.type == "blastp" + blastx: db_meta.type == "blastx" + busco: db_meta.type == "busco" + taxdump: db_meta.type == "taxdump" + } + + + // + // SUBWORKFLOW: Process samplesheet + // if ( params.fetchngs_samplesheet ) { FETCHNGSSAMPLESHEET_CHECK ( 
samplesheet ) .csv @@ -66,12 +112,11 @@ workflow INPUT_CHECK { taxon, busco_lin, lineage_tax_ids, - blastn, reads.collect(flat: false).ifEmpty([]), - params.blastp, - params.blastx, - params.blastn, - params.taxdump, + ch_databases.blastp, + ch_databases.blastx, + ch_databases.blastn, + ch_databases.taxdump, ) ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first()) @@ -115,6 +160,11 @@ workflow INPUT_CHECK { categories_tsv = GENERATE_CONFIG.out.categories_tsv // channel: [ val(meta), path(tsv) ] taxon_id = ch_taxon_id // channel: val(taxon_id) busco_lineages = ch_busco_lineages // channel: val([busco_lin]) + blastn = ch_databases.blastn // channel: [ val(meta), path(blastn_db) ] + blastp = ch_databases.blastp // channel: [ val(meta), path(blastp_db) ] + blastx = ch_databases.blastx // channel: [ val(meta), path(blastx_db) ] + busco_db = ch_databases.busco.map { _, db_path -> db_path } // channel: [ path(busco_db) ] + taxdump = ch_databases.taxdump.map { _, db_path -> db_path } // channel: [ path(taxdump) ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 280278a7..d9effaa4 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -24,17 +24,21 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.fasta) { ch_fasta = Channel.value([ [ 'id': params.accession ?: file(params.fasta.replace(".gz", "")).baseName ], file(params.fasta) ]) } else { exit 1, 'Genome fasta file must be specified!' } if (params.taxon) { ch_taxon = Channel.value(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } -if (params.blastp) { ch_blastp = Channel.value([ [ 'id': file(params.blastp).baseName ], params.blastp ]) } else { exit 1, 'Diamond BLASTp database must be specified!' 
} -if (params.blastx) { ch_blastx = Channel.value([ [ 'id': file(params.blastx).baseName ], params.blastx ]) } else { exit 1, 'Diamond BLASTx database must be specified!' } -if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).baseName ], params.blastn ]) } else { exit 1, 'BLASTn database not specified!' } -if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } +if (params.blastp) { ch_blastp = Channel.fromPath(params.blastp).map { tuple(["type": "blastp"], it) } } else { exit 1, 'Diamond BLASTp database must be specified!' } +if (params.blastx) { ch_blastx = Channel.fromPath(params.blastx).map { tuple(["type": "blastx"], it) } } else { exit 1, 'Diamond BLASTx database must be specified!' } +if (params.blastn) { ch_blastn = Channel.fromPath(params.blastn).map { tuple(["type": "blastn"], it) } } else { exit 1, 'BLASTn database not specified!' } +if (params.taxdump) { ch_taxdump = Channel.fromPath(params.taxdump).map { tuple(["type": "taxdump"], it) } } else { exit 1, 'NCBI Taxonomy database not specified!' } if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' 
} if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids).first() } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' } // Create channel for optional parameters if (params.busco_lineages) { ch_busco_lin = Channel.value(params.busco_lineages) } else { ch_busco_lin = Channel.value([]) } -if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) } +if (params.busco) { + ch_busco_db = Channel.fromPath(params.busco).first().map { tuple([ "type": "busco"], it ) } +} else { + ch_busco_db = Channel.value([]) +} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -109,6 +113,10 @@ workflow BLOBTOOLKIT { ch_busco_lin, ch_lineage_tax_ids, ch_blastn, + ch_blastx, + ch_blastp, + ch_busco_db, + ch_taxdump, ) ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) @@ -135,8 +143,8 @@ workflow BLOBTOOLKIT { BUSCO_DIAMOND ( PREPARE_GENOME.out.genome, INPUT_CHECK.out.busco_lineages, - ch_busco_db, - ch_blastp, + INPUT_CHECK.out.busco_db, + INPUT_CHECK.out.blastp, INPUT_CHECK.out.taxon_id, ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) @@ -147,7 +155,7 @@ workflow BLOBTOOLKIT { RUN_BLASTX ( PREPARE_GENOME.out.genome, BUSCO_DIAMOND.out.first_table, - ch_blastx, + INPUT_CHECK.out.blastx, INPUT_CHECK.out.taxon_id, ) ch_versions = ch_versions.mix ( RUN_BLASTX.out.versions ) @@ -159,7 +167,7 @@ workflow BLOBTOOLKIT { RUN_BLASTN ( RUN_BLASTX.out.blastx_out, PREPARE_GENOME.out.genome, - ch_blastn, + INPUT_CHECK.out.blastn, INPUT_CHECK.out.taxon_id, ) @@ -187,7 +195,7 @@ workflow BLOBTOOLKIT { BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), RUN_BLASTX.out.blastx_out.ifEmpty([[],[]]), RUN_BLASTN.out.blastn_out.ifEmpty([[],[]]), - ch_taxdump + INPUT_CHECK.out.taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLS.out.versions )