From 275c04829b076fe2d72cfa5a1a0ba29c6128ae26 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Mon, 8 Jan 2024 17:05:23 +0000
Subject: [PATCH 01/44] Fixed the conditional flow to only run BLAST_BLASTN if
 BLASTN_TAXON has an empty output

---
 subworkflows/local/run_blastn.nf | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf
index 87cb0a88..5e3c913f 100644
--- a/subworkflows/local/run_blastn.nf
+++ b/subworkflows/local/run_blastn.nf
@@ -52,23 +52,27 @@ workflow RUN_BLASTN {
     // Run blastn search
     // run blastn excluding taxon_id
     BLASTN_TAXON ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, taxon_id )
+    ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() )

     // check if blastn output table is empty
     BLASTN_TAXON.out.txt
-    | map { meta, txt -> txt.isEmpty() }
-    | set { is_txt_empty }
+    | branch { meta, txt ->
+        empty: txt.isEmpty()
+        not_empty: true
+    }
+    | set { ch_blastn_taxon_out }

     // repeat the blastn search without excluding taxon_id
-    if ( is_txt_empty ) {
-        BLAST_BLASTN ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, [] )
-        ch_blastn_txt = BLAST_BLASTN.out.txt
-    }
-    else {
-        ch_blastn_txt = BLASTN_TAXON.out.txt
-    }
+    ch_blastn_taxon_out.empty.join ( BLOBTOOLKIT_CHUNK.out.chunks )
+    | map { meta, txt, fasta -> [meta, fasta] }
+    | set { ch_blast_blastn_input }

+    BLAST_BLASTN ( ch_blast_blastn_input, blastn, [] )
     ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() )

+    BLAST_BLASTN.out.txt
+    | mix( ch_blastn_taxon_out.not_empty )
+    | set { ch_blastn_txt }

     // Unchunk chunked blastn results
     BLOBTOOLKIT_UNCHUNK ( ch_blastn_txt )

From af953d864c7d3414d6ad64cbe2e84521b9e36b58 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Tue, 9 Jan 2024 10:25:50 +0000
Subject: [PATCH 02/44] We shouldn't be using this old taxonomy database

---
 conf/test.config      | 2 +-
 conf/test_full.config | 2 +-
 conf/test_raw.config  | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 221a0f22..623cf3f9 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -30,7 +30,7 @@ params {
     taxon      = "Meles meles"

     // Databases
-    taxdump    = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
+    taxdump    = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump"
     busco      = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
     blastp     = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd"
     blastx     = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd"
diff --git a/conf/test_full.config b/conf/test_full.config
index ff1ac068..6af9eecb 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -25,7 +25,7 @@ params {
     taxon      = "Laetiporus sulphureus"

     // Databases
-    taxdump    = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
+    taxdump    = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump"
     busco      = "/lustre/scratch123/tol/resources/busco/latest"
     blastp     = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd"
     blastx     = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd"
diff --git a/conf/test_raw.config b/conf/test_raw.config
index 6d4174c2..47cc4267 100644
--- a/conf/test_raw.config
+++ b/conf/test_raw.config
@@ -31,8 +31,8 @@ params {
    taxon      = "Meles meles"

    // Databases
-    taxdump    = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
-    busco      = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
+    taxdump    = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump"
+    busco      = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
     blastp     = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd"
     blastx     = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd"
     blastn     = "${projectDir}/assets/test/nt_mMelMel3.1/"

From 26ad7a62d1ceaa05299914ddc8da5789d721f372 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Tue, 9 Jan 2024 11:17:42 +0000
Subject: [PATCH 03/44] We also need the taxdb files for the NT database

---
 docs/usage.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/usage.md b/docs/usage.md
index 4e4c9d7c..c4aad4a1 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -91,6 +91,7 @@ Retrieve the NCBI blast nt database (version 5) files and tar gunzip them. We ar

 ```bash
 wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.??.tar.gz" -P $NT/ &&
+wget https://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz -P $NT &&
 for file in $NT/*.tar.gz; do
     tar xf $file -C $NT && rm $file;
 done

From 79d9bb5d68d62fc9521070e5131a1a1947212d71 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Tue, 9 Jan 2024 14:55:21 +0000
Subject: [PATCH 04/44] File numbers now have three digits

---
 docs/usage.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index c4aad4a1..143de417 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -90,7 +90,7 @@ cd $NT
 Retrieve the NCBI blast nt database (version 5) files and tar gunzip them. We are using the `&&` syntax to ensure that each command completes without error before the next one is run:

 ```bash
-wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.??.tar.gz" -P $NT/ &&
+wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ &&
 wget https://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz -P $NT &&
 for file in $NT/*.tar.gz; do
     tar xf $file -C $NT && rm $file;

From 903ebc0731065ca8907c0768d6670772c9b7a9e6 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Wed, 10 Jan 2024 15:02:45 +0000
Subject: [PATCH 05/44] bugfix: removed extra brackets that were only needed
 in Snakemake rules

---
 bin/nohitlist.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/nohitlist.sh b/bin/nohitlist.sh
index c935cebe..9286654f 100755
--- a/bin/nohitlist.sh
+++ b/bin/nohitlist.sh
@@ -8,7 +8,7 @@ E=$4

 # find ids of sequences with no hits in the blastx search
 grep '>' $fasta | \
-    grep -v -w -f <(awk -v evalue="$E" '{{if($14<{evalue}){{print $1}}}}' $blast | sort | uniq) | \
+    grep -v -w -f <(awk -v evalue="$E" '{if($14<evalue){print $1}}' $blast | sort | uniq) | \
     cut -f1 | sed 's/>//' > $prefix.nohit.txt

From 04ea5e333e022d858208422e2c1d01d3192b2813 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Wed, 10 Jan 2024 15:03:24 +0000
Subject: [PATCH 06/44] bugfix: the headers are separated with regular
 whitespace, not tabs

---
 bin/nohitlist.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/nohitlist.sh b/bin/nohitlist.sh
index 9286654f..bd9bcc14 100755
--- a/bin/nohitlist.sh
+++ b/bin/nohitlist.sh
@@ -9,7 +9,7 @@ E=$4
 # find ids of sequences with no hits in the blastx search
 grep '>' $fasta | \
     grep -v -w -f <(awk -v evalue="$E" '{if($14<evalue){print $1}}' $blast | sort | uniq) | \
-    cut -f1 | sed 's/>//' > $prefix.nohit.txt
+    awk '{print $1}' | sed 's/>//' > $prefix.nohit.txt
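A quick way to check the two nohitlist.sh fixes above by hand is a throwaway run of the corrected pipeline. The toy inputs below are invented for illustration; the only assumption carried over from the script is that column 14 of the tabular BLAST output holds the e-value:

```bash
# Toy inputs: two FASTA headers (whitespace-separated, not tab-separated)
# and one BLAST hit for seq1 with an e-value below the cutoff
printf '>seq1 some description\n>seq2 other description\n' > toy.fasta
printf 'seq1\tc2\tc3\tc4\tc5\tc6\tc7\tc8\tc9\tc10\tc11\tc12\tc13\t1e-30\n' > toy.blast

E=1.0e-25
# Corrected logic: plain awk braces (the {{ }} were Snakemake escaping), and
# awk '{print $1}' instead of cut -f1 because the headers use spaces, not tabs
grep '>' toy.fasta | \
    grep -v -w -f <(awk -v evalue="$E" '{if($14<evalue){print $1}}' toy.blast | sort | uniq) | \
    awk '{print $1}' | sed 's/>//'
# prints "seq2": seq1 has a hit stronger than the e-value cutoff, seq2 does not
```

From 1d7fe4109bddef94a1eb1f4cb611a0181066429a Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Wed, 10 Jan 2024 15:41:45 +0000
Subject: [PATCH 07/44] Updated SAMTOOLS_FASTA

---
 modules.json                           | 2 +-
 modules/nf-core/samtools/fasta/main.nf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git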
a/modules.json b/modules.json index 38431cf4..c5020014 100644 --- a/modules.json +++ b/modules.json @@ -59,7 +59,7 @@ }, "samtools/fasta": { "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "git_sha": "9b1071e19265cf9c0d06958a011cf7a9cfe37213", "installed_by": ["modules"] }, "samtools/index": { diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf index 63e2852e..cb009ddd 100644 --- a/modules/nf-core/samtools/fasta/main.nf +++ b/modules/nf-core/samtools/fasta/main.nf @@ -24,7 +24,7 @@ process SAMTOOLS_FASTA { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" : + def output = ( interleave && ! meta.single_end ) ? "| gzip > ${prefix}_interleaved.fasta.gz" : meta.single_end ? "-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" """ From 1d7fe4109bddef94a1eb1f4cb611a0181066429a Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Jan 2024 15:45:38 +0000 Subject: [PATCH 08/44] We want *all* the reads in the output file With the `-0` option, the output file ("interleaved") is empty for PacBio because all the reads go to the "other" file. Whereas paired-reads all go to the "interleaved" file and none to the "other" file. The simplest is to not use the `-0` option. Then, `samtools fasta` simply sends all the reads to the standard output. --- modules.json | 3 ++- modules/nf-core/samtools/fasta/main.nf | 1 - modules/nf-core/samtools/fasta/samtools-fasta.diff | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/samtools/fasta/samtools-fasta.diff diff --git a/modules.json b/modules.json index c5020014..97f74e42 100644 --- a/modules.json +++ b/modules.json @@ -60,7 +60,8 @@ "samtools/fasta": { "branch": "master", "git_sha": "9b1071e19265cf9c0d06958a011cf7a9cfe37213", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/samtools/fasta/samtools-fasta.diff" }, "samtools/index": { "branch": "master", diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf index cb009ddd..4b0cad9a 100644 --- a/modules/nf-core/samtools/fasta/main.nf +++ b/modules/nf-core/samtools/fasta/main.nf @@ -32,7 +32,6 @@ process SAMTOOLS_FASTA { fasta \\ $args \\ --threads ${task.cpus-1} \\ - -0 ${prefix}_other.fasta.gz \\ $input \\ $output diff --git a/modules/nf-core/samtools/fasta/samtools-fasta.diff b/modules/nf-core/samtools/fasta/samtools-fasta.diff new file mode 100644 index 00000000..e2374ed9 --- /dev/null +++ b/modules/nf-core/samtools/fasta/samtools-fasta.diff @@ -0,0 +1,13 @@ +Changes in module 'nf-core/samtools/fasta' +--- modules/nf-core/samtools/fasta/main.nf ++++ modules/nf-core/samtools/fasta/main.nf +@@ -32,7 +32,6 @@ + fasta \\ + $args \\ + --threads ${task.cpus-1} \\ +- -0 ${prefix}_other.fasta.gz \\ + $input \\ + $output + + +************************************************************ From 7729e202d8a44291aaa936b3b29c3ecc110238ba Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Jan 2024 16:05:04 +0000 Subject: [PATCH 09/44] Correctly pick up the name of the blast database (waiting for nf-core approval) --- modules/nf-core/blast/blastn/blast-blastn.diff | 15 ++++++++++++--- modules/nf-core/blast/blastn/main.nf | 8 +++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git 
a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff index dc3f108f..cfac21d4 100644 --- a/modules/nf-core/blast/blastn/blast-blastn.diff +++ b/modules/nf-core/blast/blastn/blast-blastn.diff @@ -9,16 +9,25 @@ Changes in module 'nf-core/blast/blastn' output: tuple val(meta), path('*.txt'), emit: txt -@@ -23,7 +24,7 @@ +@@ -23,17 +24,24 @@ def prefix = task.ext.prefix ?: "${meta.id}" def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta -- + def exclude_taxon = taxid ? "-negative_taxids ${taxid}" : '' + """ if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} -@@ -34,6 +35,7 @@ + fi + +- DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` ++ DB=`find -L ./ -name "*.nal" | sed 's/\\.nal\$//'` ++ if [ -z "\$DB" ]; then ++ DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` ++ fi ++ echo Using \$DB ++ + blastn \\ -num_threads ${task.cpus} \\ -db \$DB \\ -query ${fasta_name} \\ diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index 44b581a9..065ad7cd 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -25,12 +25,18 @@ process BLAST_BLASTN { def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta def exclude_taxon = taxid ? "-negative_taxids ${taxid}" : '' + """ if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} fi - DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + DB=`find -L ./ -name "*.nal" | sed 's/\\.nal\$//'` + if [ -z "\$DB" ]; then + DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + fi + echo Using \$DB + blastn \\ -num_threads ${task.cpus} \\ -db \$DB \\ From e0f248660817c1e686d84c879b62a4bef9ada5be Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Jan 2024 16:20:42 +0000 Subject: [PATCH 10/44] Started the CHANGELOG --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd4cc71d..528dd210 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.2.0)] – Poliwag – [2024-01-XX] + +### Enhancements & fixes + +- Fixed the conditional runs of blastn +- Fixed the generation of the no-hit list +- Fixed the conversion of the unaligned input files to Fasta +- Fixed the documentation about preparing the NT database +- Fixed the detection of the NT database in the nf-core module + ## [[0.2.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.2.0)] – Pikachu – [2023-12-22] ### Enhancements & fixes From 7046a75743777cc0308283f40418418f42297cb8 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Jan 2024 17:02:35 +0000 Subject: [PATCH 11/44] Allow FastQ files as input This simplifies connecting to the fetchngs pipeline --- bin/check_samplesheet.py | 2 ++ subworkflows/local/input_check.nf | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index f5bf5c5b..c63d06fe 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -27,6 +27,8 @@ class RowChecker: VALID_FORMATS = ( ".cram", ".bam", + ".fastq", + ".fastq.gz", ) VALID_DATATYPES = ( diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 01849bd1..bb271f53 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -55,6 +55,10 @@ def create_data_channels(LinkedHashMap row) { // add path(s) of the read file(s) to the meta map def data_meta = [] + if ( !params.align && (row.datafile.endsWith(".fastq") || row.datafile.endsWith(".fastq.gz")) ) { + exit 1, "ERROR: Please check input samplesheet and pipeline parameters -> Data file is in FastQ format but --align is not set!\n${row.datafile}" + } + if ( !file(row.datafile).exists() ) { exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.datafile}" } else { From e54bc850d482afe09fcad598feef0192732d3017 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Jan 2024 17:41:55 +0000 Subject: [PATCH 12/44] Skip the conversion to fasta if the input is fastq --- subworkflows/local/minimap_alignment.nf | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf index b9a4409e..e0b479bc 100644 --- a/subworkflows/local/minimap_alignment.nf +++ b/subworkflows/local/minimap_alignment.nf @@ -20,13 +20,22 @@ workflow MINIMAP2_ALIGNMENT { ch_versions = Channel.empty() - // Convert reads to FASTA - SAMTOOLS_FASTA ( input, true ) + // Convert BAM/CRAM reads to FASTA + input + | branch { + meta, reads -> + fastq: reads.toString().endsWith(".fastq") || reads.toString().endsWith(".fastq.gz") || reads.toString().endsWith(".fq") || reads.toString().endsWith(".fq.gz") + bamcram: true + } + | set { ch_reads_by_type } + + SAMTOOLS_FASTA ( ch_reads_by_type.bamcram, true ) ch_versions = ch_versions.mix(SAMTOOLS_FASTA.out.versions.first()) // Branch input by sequencing type SAMTOOLS_FASTA.out.interleaved + | mix ( ch_reads_by_type.fastq ) | branch { meta, reads -> hic: meta.datatype == "hic" From 60c2f1d71275cd4fc5cbf08112c5cac907efba4c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Jan 2024 17:44:33 +0000 Subject: [PATCH 13/44] Like in the genome-note pipeline having a duplicated profile entry causes some problems --- conf/modules.config | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 155111ab..9f7641b5 100644 
--- a/conf/modules.config +++ b/conf/modules.config @@ -62,8 +62,11 @@ process { withName: "BUSCO" { scratch = true - // Overridden in the test profile, see at the end of this file - ext.args = "--force" + ext.args = { 'test' in workflow.profile.tokenize(',') ? + // Additional configuration to speed processes up during testing. + // Note: BUSCO *must* see the double-quotes around the parameters + '--force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\'' + : '--force' } publishDir = [ path: { "${params.outdir}/BUSCO" }, mode: params.publish_dir_mode, @@ -164,26 +167,3 @@ process { } - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Additional configuration to speed processes up during testing. - ----------------------------------------------------------------------------------------- -*/ - -profiles { - test { - process { - withName: BUSCO { - // Note: BUSCO *must* see the double-quotes around the parameters - ext.args = '--force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\'' - publishDir = [ - path: { "${params.outdir}/BUSCO" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals("versions.yml") ? null : filename } - ] - } - } - } -} From 363d79625e5b98a79e0d662dcdf1aaa84a63f788 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 11 Jan 2024 08:30:13 +0000 Subject: [PATCH 14/44] Support samplesheets created by nf-core/fetchngs --- CHANGELOG.md | 11 + bin/check_fetchngs_samplesheet.py | 241 +++++++++++++++++++++ modules/local/fetchngssamplesheet_check.nf | 32 +++ nextflow.config | 1 + nextflow_schema.json | 5 + subworkflows/local/input_check.nf | 60 ++++- workflows/blobtoolkit.nf | 1 + 7 files changed, 343 insertions(+), 8 deletions(-) create mode 100755 bin/check_fetchngs_samplesheet.py create mode 100644 modules/local/fetchngssamplesheet_check.nf diff --git a/CHANGELOG.md b/CHANGELOG.md index 528dd210..8da8f3dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed the conversion of the unaligned input files to Fasta - Fixed the documentation about preparing the NT database - Fixed the detection of the NT database in the nf-core module +- The pipeline now supports samplesheets generated by the + [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline by passing the + `--fetchngs_samplesheet true` option. + +### Parameters + +| Old parameter | New parameter | +| ------------- | ---------------------- | +| | --fetchngs_samplesheet | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. ## [[0.2.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.2.0)] – Pikachu – [2023-12-22] diff --git a/bin/check_fetchngs_samplesheet.py b/bin/check_fetchngs_samplesheet.py new file mode 100755 index 00000000..2e1edac9 --- /dev/null +++ b/bin/check_fetchngs_samplesheet.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. + + """ + + VALID_FORMATS = ( + ".fastq.gz", + ) + + def __init__( + self, + accession_col="run_accession", + model_col="instrument_model", + platform_col="instrument_platform", + library_col="library_strategy", + file1_col="fastq_1", + **kwargs, + ): + """ + Initialize the row checker with the expected column names. + + Args: + accession_col (str): The name of the column that contains the accession name + (default "run_accession"). + model_col (str): The name of the column that contains the model name + of the instrument (default "instrument_model"). + platform_col (str): The name of the column that contains the platform name + of the instrument (default "instrument_platform"). + library_col (str): The name of the column that contains the strategy of the + preparation of the library (default "library_strategy"). + """ + super().__init__(**kwargs) + self._accession_col = accession_col + self._model_col = model_col + self._platform_col = platform_col + self._library_col = library_col + self._file1_col = file1_col + self._seen = set() + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). + + """ + self._validate_accession(row) + self._validate_file(row) + self._seen.add((row[self._accession_col], row[self._file1_col])) + self.modified.append(row) + + def _validate_accession(self, row): + """Assert that the run accession name exists.""" + if len(row[self._accession_col]) <= 0: + raise AssertionError("Run accession is required.") + + def _validate_file(self, row): + """Assert that the datafile is non-empty and has the right format.""" + if len(row[self._file1_col]) <= 0: + raise AssertionError("Data file is required.") + self._validate_data_format(row[self._file1_col]) + + def _validate_data_format(self, filename): + """Assert that a given filename has one of the expected FASTQ extensions.""" + if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): + raise AssertionError( + f"The data file has an unrecognized extension: {filename}\n" + f"It should be one of: {', '.join(self.VALID_FORMATS)}" + ) + + def validate_unique_accessions(self): + """ + Assert that the combination of accession name and aligned filename is unique. + + In addition to the validation, also rename all accessions to have a suffix of _T{n}, where n is the + number of times the same accession exist, but with different FASTQ files, e.g., multiple runs per experiment. 
+ + """ + if len(self._seen) != len(self.modified): + raise AssertionError("The pair of accession and file name must be unique.") + seen = Counter() + for row in self.modified: + accession = row[self._accession_col] + seen[accession] += 1 + row[self._accession_col] = f"{accession}_T{seen[accession]}" + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. _text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + """ + Check that the tabular samplesheet has the structure expected by sanger-tol pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. + + Example: + This function checks that the samplesheet follows the following structure, + see also the `blobtoolkit samplesheet`_:: + + sample,datatype,datafile + sample1,hic,/path/to/file1.cram + sample1,pacbio,/path/to/file2.cram + sample1,ont,/path/to/file3.cram + + .. _blobtoolkit samplesheet: + https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv + + """ + required_columns = {"run_accession", "instrument_model", "instrument_platform", "library_strategy", "fastq_1"} + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + checker.validate_unique_accessions() + header = list(reader.fieldnames) + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
+ with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + parser.add_argument( + "-v", + "--version", + action="version", + version="%(prog)s 1.0.0", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/modules/local/fetchngssamplesheet_check.nf b/modules/local/fetchngssamplesheet_check.nf new file mode 100644 index 00000000..962768d5 --- /dev/null +++ b/modules/local/fetchngssamplesheet_check.nf @@ -0,0 +1,32 @@ +process FETCHNGSSAMPLESHEET_CHECK { + tag "$samplesheet" + label 'process_single' + + conda "conda-forge::python=3.9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + path samplesheet + + output: + path '*.csv' , emit: csv + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in sanger-tol/blobtoolkit/bin/ + """ + check_fetchngs_samplesheet.py \\ + $samplesheet \\ + samplesheet.valid.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + check_fetchngs_samplesheet.py: \$(check_fetchngs_samplesheet.py --version | cut -d' ' -f2) + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 98b0398c..25d6f721 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,7 @@ params { yaml = null align = false mask = false + fetchngs_samplesheet = false // Reference options fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 37c8a567..a9dc4885 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -32,6 +32,11 @@ "description": "Turn on optional genome masking if needed.", "fa_icon": "fas fa-toggle-off" }, + "fetchngs_samplesheet": { + "type": "boolean", + "description": "Turn on the conversion from a nf-core/fetchngs samplesheet.", + "fa_icon": "fas fa-toggle-off" + }, "yaml": { "type": "string", "format": "file-path", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index bb271f53..f8d515fd 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,8 +2,9 @@ // Check input samplesheet and get aligned read channels // -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' -include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' +include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { FETCHNGSSAMPLESHEET_CHECK } from '../../modules/local/fetchngssamplesheet_check' +include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' workflow INPUT_CHECK { take: @@ -14,14 +15,23 @@ workflow INPUT_CHECK { main: ch_versions = Channel.empty() + if ( params.fetchngs_samplesheet ) { + FETCHNGSSAMPLESHEET_CHECK ( samplesheet ) + .csv + .splitCsv ( header:true, sep:',' ) + .map { create_data_channels_from_fetchngs(it) } + .set { aln } + ch_versions = ch_versions.mix ( FETCHNGSSAMPLESHEET_CHECK.out.versions.first() ) - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_data_channels(it) } - .set { aln } + } else { + SAMPLESHEET_CHECK ( samplesheet ) + .csv + .splitCsv ( header:true, sep:',' ) + .map { create_data_channels(it) } + .set { aln } + ch_versions = ch_versions.mix ( SAMPLESHEET_CHECK.out.versions.first() ) + } - ch_versions = ch_versions.mix ( SAMPLESHEET_CHECK.out.versions.first() ) if ( !params.yaml ) { aln @@ -67,3 +77,37 @@ def create_data_channels(LinkedHashMap row) { return data_meta } + +// Function to get list of [ meta, datafile ] +def create_data_channels_from_fetchngs(LinkedHashMap row) { + // create meta map + def meta = [:] + meta.id = row.run_accession + + switch (row.instrument_platform) { + case "ILLUMINA": + meta.datatype = (row.library_strategy == "Hi-C" ? "hic" : "illumina") + break + case "OXFORD_NANOPORE": + meta.datatype = "ont" + break + case "PACBIO_SMRT": + meta.datatype = (row.instrument_model == "Sequel" ? 
"pacbio_clr" : "pacbio") + break + default: + meta.datatype = "illumina" + } + + + // add path(s) of the read file(s) to the meta map + def data_meta = [] + + if ( !file(row.fastq_1).exists() ) { + exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.fastq_1}" + } else { + data_meta = [ meta, file(row.fastq_1) ] + } + + return data_meta +} + diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 0d452a54..aeb1bdfc 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -28,6 +28,7 @@ if (params.blastp && params.accession) { ch_blastp = Channel.of([ [ 'id': params if (params.blastx && params.accession) { ch_blastx = Channel.of([ [ 'id': params.accession ], params.blastx ]).first() } else { exit 1, 'Diamond BLASTx database and accession must be specified!' } if (params.blastn && params.accession) { ch_blastn = Channel.of([ [ 'id': params.accession ], params.blastn ]).first() } else { exit 1, 'BLASTn database not specified!' } if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } +if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' } // Create channel for optional parameters if (params.busco) { ch_busco_db = Channel.fromPath(params.busco) } else { ch_busco_db = Channel.empty() } From 74b60dd4f46da934e9cf23ce1292584c135b78b0 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 11 Jan 2024 08:41:37 +0000 Subject: [PATCH 15/44] Support input read-sets being paired-end --- bin/check_fetchngs_samplesheet.py | 8 +- modules.json | 5 + modules/nf-core/cat/cat/environment.yml | 7 + modules/nf-core/cat/cat/main.nf | 70 +++++++ modules/nf-core/cat/cat/meta.yml | 36 ++++ modules/nf-core/cat/cat/tests/main.nf.test | 179 ++++++++++++++++++ .../nf-core/cat/cat/tests/main.nf.test.snap | 121 ++++++++++++ .../cat/tests/nextflow_unzipped_zipped.config | 6 + .../cat/tests/nextflow_zipped_unzipped.config | 8 + modules/nf-core/cat/cat/tests/tags.yml | 2 + subworkflows/local/input_check.nf | 18 +- 11 files changed, 457 insertions(+), 3 deletions(-) create mode 100644 modules/nf-core/cat/cat/environment.yml create mode 100644 modules/nf-core/cat/cat/main.nf create mode 100644 modules/nf-core/cat/cat/meta.yml create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config create mode 100644 modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config create mode 100644 modules/nf-core/cat/cat/tests/tags.yml diff --git a/bin/check_fetchngs_samplesheet.py b/bin/check_fetchngs_samplesheet.py index 2e1edac9..41128254 100755 --- a/bin/check_fetchngs_samplesheet.py +++ b/bin/check_fetchngs_samplesheet.py @@ -35,6 +35,7 @@ def __init__( platform_col="instrument_platform", library_col="library_strategy", file1_col="fastq_1", + file2_col="fastq_2", **kwargs, ): """ @@ -49,6 +50,8 @@ def __init__( of the instrument (default "instrument_platform"). library_col (str): The name of the column that contains the strategy of the preparation of the library (default "library_strategy"). + file2_col (str): The name of the column that contains the second file path + for the paired-end read data (default "fastq_2"). 
""" super().__init__(**kwargs) self._accession_col = accession_col @@ -56,6 +59,7 @@ def __init__( self._platform_col = platform_col self._library_col = library_col self._file1_col = file1_col + self._file2_col = file2_col self._seen = set() self.modified = [] @@ -83,6 +87,8 @@ def _validate_file(self, row): if len(row[self._file1_col]) <= 0: raise AssertionError("Data file is required.") self._validate_data_format(row[self._file1_col]) + if row[self._file2_col]: + self._validate_data_format(row[self._file2_col]) def _validate_data_format(self, filename): """Assert that a given filename has one of the expected FASTQ extensions.""" @@ -165,7 +171,7 @@ def check_samplesheet(file_in, file_out): https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv """ - required_columns = {"run_accession", "instrument_model", "instrument_platform", "library_strategy", "fastq_1"} + required_columns = {"run_accession", "instrument_model", "instrument_platform", "library_strategy", "fastq_1", "fastq_2"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_in.open(newline="") as in_handle: reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) diff --git a/modules.json b/modules.json index 97f74e42..2efbb154 100644 --- a/modules.json +++ b/modules.json @@ -17,6 +17,11 @@ "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco.diff" }, + "cat/cat": { + "branch": "master", + "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "installed_by": ["modules"] + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 00000000..17a04ef2 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 00000000..970ab760 --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,70 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? 
"| pigz -c -p $task.cpus $args2" : '' + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 00000000..00a8db0b --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 00000000..ed5a4f12 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,179 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_name_conflict") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'genome', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("The name of the input file can't be the same as for the output prefix") } + ) + } + } + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = 
"${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 00000000..423571ba --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": 
"2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 00000000..ec26b0fd --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 00000000..fbc79783 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 00000000..37b578f5 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index f8d515fd..c0f0ad34 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,6 +2,7 @@ // Check input samplesheet and get aligned read channels // +include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' include { FETCHNGSSAMPLESHEET_CHECK } from '../../modules/local/fetchngssamplesheet_check' include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' @@ -19,10 +20,23 @@ workflow INPUT_CHECK { FETCHNGSSAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { create_data_channels_from_fetchngs(it) } - .set { aln } + .branch { row -> + paired: row.fastq_2 + [[id: row.run_accession, row:row], [row.fastq_1, row.fastq_2]] + not_paired: true + } + .set { reads_pairedness } ch_versions = ch_versions.mix ( FETCHNGSSAMPLESHEET_CHECK.out.versions.first() ) + CAT_CAT ( reads_pairedness.paired ) + ch_versions = ch_versions.mix ( CAT_CAT.out.versions.first() ) + + CAT_CAT.out.file_out + | map { meta, file -> meta.row + [fastq_1: file] } + | mix ( reads_pairedness.not_paired ) + | map { create_data_channels_from_fetchngs(it) } + | set { aln } + } else { SAMPLESHEET_CHECK ( samplesheet ) .csv From a18fe529fac15ff7e76c14de5e357d36ff05c70b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 12 Jan 2024 10:46:15 +0000 Subject: [PATCH 16/44] [lint] black --- bin/check_fetchngs_samplesheet.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 
deletions(-) diff --git a/bin/check_fetchngs_samplesheet.py b/bin/check_fetchngs_samplesheet.py index 41128254..324811c9 100755 --- a/bin/check_fetchngs_samplesheet.py +++ b/bin/check_fetchngs_samplesheet.py @@ -24,9 +24,7 @@ class RowChecker: """ - VALID_FORMATS = ( - ".fastq.gz", - ) + VALID_FORMATS = (".fastq.gz",) def __init__( self, @@ -171,7 +169,14 @@ def check_samplesheet(file_in, file_out): https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv """ - required_columns = {"run_accession", "instrument_model", "instrument_platform", "library_strategy", "fastq_1", "fastq_2"} + required_columns = { + "run_accession", + "instrument_model", + "instrument_platform", + "library_strategy", + "fastq_1", + "fastq_2", + } # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_in.open(newline="") as in_handle: reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) From 7527e7a9b9215148fb91f2d7661ecad4ca19f43a Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 12 Jan 2024 14:25:50 +0000 Subject: [PATCH 17/44] Updated CAT_CAT module that preserves the file extension This means that .fastq.gz will remain .fastq.gz and can then match the condition to bypass the SAMTOOLS_FASTA process. There is a pull-request for this, https://github.com/nf-core/modules/pull/4230 --- modules.json | 3 ++- modules/nf-core/cat/cat/cat-cat.diff | 34 ++++++++++++++++++++++++++++ modules/nf-core/cat/cat/main.nf | 11 ++++++++- 3 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/cat/cat/cat-cat.diff diff --git a/modules.json b/modules.json index 2efbb154..d77a8341 100644 --- a/modules.json +++ b/modules.json @@ -20,7 +20,8 @@ "cat/cat": { "branch": "master", "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/cat/cat/cat-cat.diff" }, "custom/dumpsoftwareversions": { "branch": "master", diff --git a/modules/nf-core/cat/cat/cat-cat.diff b/modules/nf-core/cat/cat/cat-cat.diff new file mode 100644 index 00000000..4d2fedae --- /dev/null +++ b/modules/nf-core/cat/cat/cat-cat.diff @@ -0,0 +1,34 @@ +Changes in module 'nf-core/cat/cat' +--- modules/nf-core/cat/cat/main.nf ++++ modules/nf-core/cat/cat/main.nf +@@ -22,6 +22,8 @@ + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + ++ // choose appropriate concatenation tool depending on input and output format ++ + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | +@@ -30,7 +32,7 @@ + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default +- prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" ++ prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' +@@ -68,3 +70,10 @@ + END_VERSIONS + """ + } ++ ++// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz ++def getFileSuffix(filename) { ++ def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ ++ return match ? 
match[0][1] : filename.substring(filename.lastIndexOf('.')) ++} ++ + +************************************************************ diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf index 970ab760..adbdbd7b 100644 --- a/modules/nf-core/cat/cat/main.nf +++ b/modules/nf-core/cat/cat/main.nf @@ -22,6 +22,8 @@ process CAT_CAT { def args2 = task.ext.args2 ?: '' def file_list = files_in.collect { it.toString() } + // choose appropriate concatenation tool depending on input and output format + // | input | output | command1 | command2 | // |-----------|------------|----------|----------| // | gzipped | gzipped | cat | | @@ -30,7 +32,7 @@ process CAT_CAT { // | ungzipped | gzipped | cat | pigz | // Use input file ending as default - prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" out_zip = prefix.endsWith('.gz') in_zip = file_list[0].endsWith('.gz') command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' @@ -68,3 +70,10 @@ process CAT_CAT { END_VERSIONS """ } + +// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz +def getFileSuffix(filename) { + def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ + return match ? match[0][1] : filename.substring(filename.lastIndexOf('.')) +} + From b99101bad1fa6e48e1d9667ab9aff61ac531bdbf Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 15 Jan 2024 18:26:24 +0000 Subject: [PATCH 18/44] Report all BUSCOs and in the right order --- modules/local/blobtoolkit/createblobdir.nf | 5 ++-- subworkflows/local/busco_diamond_blastp.nf | 27 +++++++++++++++++----- subworkflows/local/collate_stats.nf | 8 ++----- workflows/blobtoolkit.nf | 4 ++-- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/modules/local/blobtoolkit/createblobdir.nf b/modules/local/blobtoolkit/createblobdir.nf index 54810650..22399365 100644 --- a/modules/local/blobtoolkit/createblobdir.nf +++ b/modules/local/blobtoolkit/createblobdir.nf @@ -9,7 +9,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { input: tuple val(meta), path(window, stageAs: 'windowstats/*') - tuple val(meta1), path(busco) + tuple val(meta1), path(busco, stageAs: 'lineage??/*') tuple val(meta2), path(blastp) tuple val(meta3), path(yaml) path(taxdump) @@ -24,6 +24,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" + def busco_args = busco.collect { "--busco " + it } .join(' ') def hits_blastp = blastp ? 
"--hits ${blastp}" : "" """ blobtools replace \\ @@ -31,7 +32,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { --meta ${yaml} \\ --taxdump ${taxdump} \\ --taxrule buscogenes \\ - --busco ${busco} \\ + ${busco_args} \\ ${hits_blastp} \\ --threads ${task.cpus} \\ $args \\ diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 6037de19..709bffaf 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -51,7 +51,7 @@ workflow BUSCO_DIAMOND { // Add the basal lineages to the list (excluding duplicates) - basal_lineages = [ "archaea_odb10", "bacteria_odb10", "eukaryota_odb10" ] + basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] ch_ancestral_lineages | map { lineages -> (lineages + basal_lineages).unique() } | flatten () @@ -86,11 +86,26 @@ workflow BUSCO_DIAMOND { ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) - // Select BUSCO results for taxonomically closest database + // Index the lineages in the taxonomic order + def lineage_index = 0 + ch_lineages + | map { lineage -> [lineage, lineage_index++] } + | set { ch_ordered_lineages } + + + // Order BUSCO results accoring to ch_lineages BUSCO.out.full_table - | combine ( ch_lineages.toList().map { it[0] } ) - | filter { meta, table, lineage -> table =~ /$lineage/ } - | map { meta, table, lineage -> [ meta, table ] } + | map { meta, table -> [table.parent.baseName.minus("run_"), meta, table] } + | join ( ch_ordered_lineages ) + | map { lineage, meta, table, index -> [meta, table, index] } + | groupTuple() + | map { meta, tables, indexes -> [ meta, tables.withIndex().sort { a, b -> indexes[a[1]] <=> indexes[b[1]] } . collect { table, i -> table } ] } + | set { ch_indexed_buscos } + + + // Select BUSCO results for taxonomically closest database + ch_indexed_buscos + | map { meta, tables -> [meta, tables[0]] } | set { ch_first_table } @@ -102,7 +117,7 @@ workflow BUSCO_DIAMOND { emit: first_table = ch_first_table // channel: [ val(meta), path(full_table) ] - full_table = BUSCO.out.full_table // channel: [ val(meta), path(full_tables) ] + all_tables = ch_indexed_buscos // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] taxon_id = ch_taxid // channel: taxon_id multiqc // channel: [ meta, summary ] diff --git a/subworkflows/local/collate_stats.nf b/subworkflows/local/collate_stats.nf index 21baf44a..08bc43c9 100644 --- a/subworkflows/local/collate_stats.nf +++ b/subworkflows/local/collate_stats.nf @@ -9,7 +9,7 @@ include { BLOBTOOLKIT_WINDOWSTATS } from '../../modules/local/blobtoolkit/window workflow COLLATE_STATS { take: - busco_table // channel: [ val(meta), path(full_table) ] + busco // channel: [ val(meta), path(full_table) ] bed // channel: [ val(meta), path(bed) ] freq // channel: [ val(meta), path(freq) ] mononuc // channel: [ val(meta), path(mononuc) ] @@ -20,11 +20,7 @@ workflow COLLATE_STATS { // Count BUSCO genes in a region - busco_table - | groupTuple() - | set { ch_busco } - - BLOBTOOLKIT_COUNTBUSCOS ( ch_busco, bed ) + BLOBTOOLKIT_COUNTBUSCOS ( busco, bed ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_COUNTBUSCOS.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index aeb1bdfc..944ccc4c 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -172,7 +172,7 @@ workflow BLOBTOOLKIT { // SUBWORKFLOW: Collate genome statistics by various window sizes // COLLATE_STATS ( - 
BUSCO_DIAMOND.out.full_table,
+        BUSCO_DIAMOND.out.all_tables,
         COVERAGE_STATS.out.bed,
         COVERAGE_STATS.out.freq,
         COVERAGE_STATS.out.mononuc,
@@ -186,7 +186,7 @@
     BLOBTOOLS (
         INPUT_CHECK.out.config,
         COLLATE_STATS.out.window_tsv,
-        BUSCO_DIAMOND.out.first_table,
+        BUSCO_DIAMOND.out.all_tables,
         BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]),
         RUN_BLASTX.out.blastx_out.ifEmpty([[],[]]),
         RUN_BLASTN.out.blastn_out.ifEmpty([[],[]]),

From 9dc30da2887d52a2e49782ff11e394266866a531 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Tue, 16 Jan 2024 13:45:32 +0000
Subject: [PATCH 19/44] Removed "--update-plot" as it is not used by default in the Snakemake version

---
 conf/modules.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/modules.config b/conf/modules.config
index 9f7641b5..bebb909e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -100,7 +100,7 @@ process {
     }

     withName: "BLOBTOOLKIT_UPDATEBLOBDIR" {
-        ext.args = "--evalue 1.0e-25 --hit-count 10 --update-plot"
+        ext.args = "--evalue 1.0e-25 --hit-count 10"
         publishDir = [
             path: { "${params.outdir}/" },
             mode: params.publish_dir_mode,

From 48ebdca2df9abdc72a7a6735625d65679bfb36e7 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Thu, 18 Jan 2024 17:19:58 +0000
Subject: [PATCH 20/44] Bumped up the number of retries to better accommodate larger genomes

---
 conf/base.config | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index 4d5e9045..6ebea12c 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -16,7 +16,7 @@ process {
     time = { check_max( 4.h * task.attempt, 'time' ) }

     errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
-    maxRetries = 1
+    maxRetries = 5
     maxErrors = '-1'

     // Process-specific resource requirements
@@ -52,13 +52,6 @@
     withLabel:process_high_memory {
         memory = { check_max( 200.GB * task.attempt, 'memory' ) }
     }
-    withLabel:error_ignore {
-        errorStrategy = 'ignore'
-    }
-    withLabel:error_retry {
-        errorStrategy = 'retry'
-        maxRetries = 2
-    }
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }

From 9e19765dd998471d85dd5f8d8822ede4c1e87277 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Thu, 18 Jan 2024 17:22:18 +0000
Subject: [PATCH 21/44] Bumped up the version of blobtoolkit because of a bug in windowstats

---
 modules/local/blobtoolkit/chunk.nf         | 2 +-
 modules/local/blobtoolkit/config.nf        | 2 +-
 modules/local/blobtoolkit/countbuscos.nf   | 2 +-
 modules/local/blobtoolkit/createblobdir.nf | 2 +-
 modules/local/blobtoolkit/extractbuscos.nf | 2 +-
 modules/local/blobtoolkit/metadata.nf      | 2 +-
 modules/local/blobtoolkit/summary.nf       | 2 +-
 modules/local/blobtoolkit/unchunk.nf       | 2 +-
 modules/local/blobtoolkit/updateblobdir.nf | 2 +-
 modules/local/blobtoolkit/windowstats.nf   | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/modules/local/blobtoolkit/chunk.nf b/modules/local/blobtoolkit/chunk.nf
index 38bc37fe..73f27532 100644
--- a/modules/local/blobtoolkit/chunk.nf
+++ b/modules/local/blobtoolkit/chunk.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_CHUNK {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. Please use Docker / Singularity / Podman instead."
} - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta) , path(fasta) diff --git a/modules/local/blobtoolkit/config.nf b/modules/local/blobtoolkit/config.nf index 0a9c2f58..d93b85b4 100644 --- a/modules/local/blobtoolkit/config.nf +++ b/modules/local/blobtoolkit/config.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CONFIG { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "GENERATE_CONFIG module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), val(reads) diff --git a/modules/local/blobtoolkit/countbuscos.nf b/modules/local/blobtoolkit/countbuscos.nf index e151cde8..bd817bae 100644 --- a/modules/local/blobtoolkit/countbuscos.nf +++ b/modules/local/blobtoolkit/countbuscos.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_COUNTBUSCOS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_COUNTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(table, stageAs: 'dir??/*') diff --git a/modules/local/blobtoolkit/createblobdir.nf b/modules/local/blobtoolkit/createblobdir.nf index 22399365..b61b8fc9 100644 --- a/modules/local/blobtoolkit/createblobdir.nf +++ b/modules/local/blobtoolkit/createblobdir.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(window, stageAs: 'windowstats/*') diff --git a/modules/local/blobtoolkit/extractbuscos.nf b/modules/local/blobtoolkit/extractbuscos.nf index e34bfd93..1329dd8a 100644 --- a/modules/local/blobtoolkit/extractbuscos.nf +++ b/modules/local/blobtoolkit/extractbuscos.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_EXTRACTBUSCOS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_EXTRACTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(fasta) diff --git a/modules/local/blobtoolkit/metadata.nf b/modules/local/blobtoolkit/metadata.nf index 8e2d585d..96948345 100644 --- a/modules/local/blobtoolkit/metadata.nf +++ b/modules/local/blobtoolkit/metadata.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_METADATA { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_METADATA module does not support Conda. Please use Docker / Singularity / Podman instead." 
} - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(yaml) diff --git a/modules/local/blobtoolkit/summary.nf b/modules/local/blobtoolkit/summary.nf index ac92a3b3..45f0471a 100644 --- a/modules/local/blobtoolkit/summary.nf +++ b/modules/local/blobtoolkit/summary.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_SUMMARY { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_SUMMARY module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(blobdir) diff --git a/modules/local/blobtoolkit/unchunk.nf b/modules/local/blobtoolkit/unchunk.nf index b544bf1f..f9797178 100644 --- a/modules/local/blobtoolkit/unchunk.nf +++ b/modules/local/blobtoolkit/unchunk.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UNCHUNK { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_UNCHUNK module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(blast_table) diff --git a/modules/local/blobtoolkit/updateblobdir.nf b/modules/local/blobtoolkit/updateblobdir.nf index 7a677828..cbcdc7b5 100644 --- a/modules/local/blobtoolkit/updateblobdir.nf +++ b/modules/local/blobtoolkit/updateblobdir.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEBLOBDIR { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(input) diff --git a/modules/local/blobtoolkit/windowstats.nf b/modules/local/blobtoolkit/windowstats.nf index dde880e6..2975ede1 100644 --- a/modules/local/blobtoolkit/windowstats.nf +++ b/modules/local/blobtoolkit/windowstats.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_WINDOWSTATS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "GET_WINDOW_STATS module does not support Conda. Please use Docker / Singularity / Podman instead." 
} - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(tsv) From a815df01ed78c624ee2daf378d40ad5eb469cf87 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 18 Jan 2024 22:44:36 +0000 Subject: [PATCH 22/44] Updated these two modules from upstream --- modules.json | 4 +- .../nf-core/blast/blastn/blast-blastn.diff | 14 +-- .../nf-core/blast/blastn/tests/main.nf.test | 1 + modules/nf-core/cat/cat/cat-cat.diff | 34 ------- modules/nf-core/cat/cat/tests/main.nf.test | 6 +- .../nf-core/cat/cat/tests/main.nf.test.snap | 92 ++++++++++++------- 6 files changed, 65 insertions(+), 86 deletions(-) delete mode 100644 modules/nf-core/cat/cat/cat-cat.diff diff --git a/modules.json b/modules.json index d77a8341..7ba1a8db 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "blast/blastn": { "branch": "master", - "git_sha": "f0d13ae7e1f9b24a705764f8673af859268d7077", + "git_sha": "209e5a3e2753c5e628736a662c877c20f341ee15", "installed_by": ["modules"], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, @@ -19,7 +19,7 @@ }, "cat/cat": { "branch": "master", - "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "git_sha": "81f27e75847087865299cc46605deb3b09b4e0a2", "installed_by": ["modules"], "patch": "modules/nf-core/cat/cat/cat-cat.diff" }, diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff index cfac21d4..1695c793 100644 --- a/modules/nf-core/blast/blastn/blast-blastn.diff +++ b/modules/nf-core/blast/blastn/blast-blastn.diff @@ -9,7 +9,7 @@ Changes in module 'nf-core/blast/blastn' output: tuple val(meta), path('*.txt'), emit: txt -@@ -23,17 +24,24 @@ +@@ -23,6 +24,7 @@ def prefix = task.ext.prefix ?: "${meta.id}" def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? 
fasta.getBaseName() : fasta @@ -17,17 +17,7 @@ Changes in module 'nf-core/blast/blastn' """ if [ "${is_compressed}" == "true" ]; then - gzip -c -d ${fasta} > ${fasta_name} - fi - -- DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` -+ DB=`find -L ./ -name "*.nal" | sed 's/\\.nal\$//'` -+ if [ -z "\$DB" ]; then -+ DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` -+ fi -+ echo Using \$DB -+ - blastn \\ +@@ -39,6 +41,7 @@ -num_threads ${task.cpus} \\ -db \$DB \\ -query ${fasta_name} \\ diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test b/modules/nf-core/blast/blastn/tests/main.nf.test index 0e909a7e..02ecfab5 100644 --- a/modules/nf-core/blast/blastn/tests/main.nf.test +++ b/modules/nf-core/blast/blastn/tests/main.nf.test @@ -8,6 +8,7 @@ nextflow_process { tag "modules_nfcore" tag "blast" tag "blast/blastn" + tag "blast/makeblastdb" setup { run("BLAST_MAKEBLASTDB") { diff --git a/modules/nf-core/cat/cat/cat-cat.diff b/modules/nf-core/cat/cat/cat-cat.diff deleted file mode 100644 index 4d2fedae..00000000 --- a/modules/nf-core/cat/cat/cat-cat.diff +++ /dev/null @@ -1,34 +0,0 @@ -Changes in module 'nf-core/cat/cat' ---- modules/nf-core/cat/cat/main.nf -+++ modules/nf-core/cat/cat/main.nf -@@ -22,6 +22,8 @@ - def args2 = task.ext.args2 ?: '' - def file_list = files_in.collect { it.toString() } - -+ // choose appropriate concatenation tool depending on input and output format -+ - // | input | output | command1 | command2 | - // |-----------|------------|----------|----------| - // | gzipped | gzipped | cat | | -@@ -30,7 +32,7 @@ - // | ungzipped | gzipped | cat | pigz | - - // Use input file ending as default -- prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" -+ prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" - out_zip = prefix.endsWith('.gz') - in_zip = file_list[0].endsWith('.gz') - command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' -@@ -68,3 +70,10 @@ - END_VERSIONS - """ - } -+ -+// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz -+def getFileSuffix(filename) { -+ def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ -+ return match ? 
match[0][1] : filename.substring(filename.lastIndexOf('.')) -+} -+ - -************************************************************ diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test index ed5a4f12..aaae04f9 100644 --- a/modules/nf-core/cat/cat/tests/main.nf.test +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -83,8 +83,7 @@ nextflow_process { def lines = path(process.out.file_out.get(0).get(1)).linesGzip assertAll( { assert process.success }, - { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, - { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + { assert snapshot(process.out).match() } ) } } @@ -142,8 +141,7 @@ nextflow_process { def lines = path(process.out.file_out.get(0).get(1)).linesGzip assertAll( { assert process.success }, - { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, - { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + { assert snapshot(process.out).match() } ) } } diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap index 423571ba..0c9bfe8d 100644 --- a/modules/nf-core/cat/cat/tests/main.nf.test.snap +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -1,10 +1,4 @@ { - "test_cat_unzipped_zipped_size": { - "content": [ - 375 - ], - "timestamp": "2023-10-16T14:33:08.049445686" - }, "test_cat_unzipped_unzipped": { "content": [ { @@ -67,31 +61,36 @@ ], "timestamp": "2023-10-16T14:32:49.642741302" }, - "test_cat_zipped_zipped_lines": { - "content": [ - [ - "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", - "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", - "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", - "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", - "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", - "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" - ] - ], - "timestamp": "2023-10-16T14:32:33.629048645" - }, - "test_cat_unzipped_zipped_lines": { + "test_cat_zipped_zipped": { "content": [ - [ - ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", - "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", - "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", - "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", - "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", - "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" - ] + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.gff3.gz:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.gff3.gz:md5,c439d3b60e7bc03e8802a451a0d9a5d9" 
+ ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } ], - "timestamp": "2023-10-16T14:33:08.038830506" + "timestamp": "2024-01-12T14:02:02.999254641" }, "test_cat_one_file_unzipped_zipped_lines": { "content": [ @@ -106,16 +105,41 @@ ], "timestamp": "2023-10-16T14:33:21.39642399" }, - "test_cat_zipped_zipped_size": { + "test_cat_unzipped_zipped": { "content": [ - 78 + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt.gz:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt.gz:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } ], - "timestamp": "2023-10-16T14:32:33.641869244" + "timestamp": "2024-01-12T14:08:26.948048418" }, "test_cat_one_file_unzipped_zipped_size": { "content": [ 374 ], - "timestamp": "2023-10-16T14:33:21.4094373" + "timestamp": "2024-01-12T14:10:22.445700266" } -} \ No newline at end of file +} From ac42a3af676cdeaacf12baf399f8f2aa9a44024c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:24:05 +0000 Subject: [PATCH 23/44] Updated the CHANGELOG --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8da8f3dd..34f49933 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The pipeline now supports samplesheets generated by the [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline by passing the `--fetchngs_samplesheet true` option. +- FastQ files can bypass the conversion to Fasta +- Fixed missing BUSCO results from the blobdir (only 1 BUSCO was loaded) +- Fixed the default category used to colour the blob plots ### Parameters From 864d3bc8ef7c1310cfbe463cf2e3640db8f9678d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:29:37 +0000 Subject: [PATCH 24/44] Documented the decision tree --- subworkflows/local/input_check.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index c0f0ad34..5b028911 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -98,6 +98,8 @@ def create_data_channels_from_fetchngs(LinkedHashMap row) { def meta = [:] meta.id = row.run_accession + // Same as https://github.com/blobtoolkit/blobtoolkit/blob/4.3.3/src/blobtoolkit-pipeline/src/lib/functions.py#L30-L39 + // with the addition of "hic" switch (row.instrument_platform) { case "ILLUMINA": meta.datatype = (row.library_strategy == "Hi-C" ? 
"hic" : "illumina") From 4c27f5272d095418128f8d70a4ca3663f91c5b69 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:33:05 +0000 Subject: [PATCH 25/44] Renamed the variables to make the code easier to understand --- subworkflows/local/busco_diamond_blastp.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 709bffaf..a43b26dd 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -87,19 +87,19 @@ workflow BUSCO_DIAMOND { // Index the lineages in the taxonomic order - def lineage_index = 0 + def lineage_position = 0 ch_lineages - | map { lineage -> [lineage, lineage_index++] } + | map { lineage -> [lineage, lineage_position++] } | set { ch_ordered_lineages } - // Order BUSCO results accoring to ch_lineages + // Order BUSCO results according to ch_ordered_lineages BUSCO.out.full_table | map { meta, table -> [table.parent.baseName.minus("run_"), meta, table] } | join ( ch_ordered_lineages ) | map { lineage, meta, table, index -> [meta, table, index] } | groupTuple() - | map { meta, tables, indexes -> [ meta, tables.withIndex().sort { a, b -> indexes[a[1]] <=> indexes[b[1]] } . collect { table, i -> table } ] } + | map { meta, tables, positions -> [ meta, tables.withIndex().sort { a, b -> positions[a[1]] <=> positions[b[1]] } . collect { table, i -> table } ] } | set { ch_indexed_buscos } From 8dc4f0cb41af8232bbb8b8b84f172fbb19b0f31b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:34:30 +0000 Subject: [PATCH 26/44] Not used --- docs/decision-records/README.md | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 docs/decision-records/README.md diff --git a/docs/decision-records/README.md b/docs/decision-records/README.md deleted file mode 100644 index bd17babb..00000000 --- a/docs/decision-records/README.md +++ /dev/null @@ -1,25 +0,0 @@ -Design decisions about the pipeline are indexed and recorded as individual files in this directory. - -To add a new decision, please create a pull request that adds a new markdown file named `XX-short-summary.md` to this directory. When replacing a previous decision, change the status of the latter to "Superseded" and add this to the title of the file `superseded-XX-short-summary.md`. The new file should have the following structure: - -## Title – Decision Statement - -## Status – Either Proposed, Rejected, Current, Deprecated or Superseded - -If this issue has been superseded, please add a line saying 'Superseded by '. - -## Context - -Explain why a decision is needed (problem statement) and provide details of the different options considered when making this decision. - -## Decision - -State what option was selected and why was it picked over other choices. - -## Consequences - -Reflect on how this decision will impact other planned work, or what new work needs to be planned to implement the decision. - -## Discussion Notes and Linked Issues or Pull Requests - -Add any offline discussion notes here, along with associated issue(s) and pull request links. 
From 6789b6f27441bb13caa4e648db5c3e47a9f56adb Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:34:52 +0000 Subject: [PATCH 27/44] Fixed the link --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34f49933..f41ecd64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.2.0)] – Poliwag – [2024-01-XX] +## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-01-XX] ### Enhancements & fixes From 5e7d53a2d544dc780344fbc671b674825924d7e2 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:35:05 +0000 Subject: [PATCH 28/44] Let's make it happen today --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f41ecd64..a326925f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-01-XX] +## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-01-19] ### Enhancements & fixes From 9a0c6a0dd023041dce96fa2adfd72ac89ac951c7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:41:16 +0000 Subject: [PATCH 29/44] Updated the documentation --- README.md | 22 ++-------------------- docs/output.md | 2 ++ docs/usage.md | 6 +++++- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index fab2a350..c2f2a9fc 100644 --- a/README.md +++ b/README.md @@ -13,19 +13,6 @@ **sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes. It takes a samplesheet and aligned CRAM files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots. - - - - - - - - 1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows)) 2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk)) 3. Fetch associated BUSCO lineages ([`goat/taxonsearch`](https://github.com/genomehubs/goat-cli)) @@ -44,9 +31,6 @@ > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - - First, prepare a samplesheet with your input data that looks as follows: `samplesheet.csv`: @@ -58,12 +42,10 @@ mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram ``` -Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, pacbio_clr, illumina). 
The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. +Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. Now, you can run the pipeline using: - - ```bash nextflow run sanger-tol/blobtoolkit \ -profile \ @@ -86,7 +68,7 @@ For more details, please refer to the [usage documentation](https://pipelines.to ## Pipeline output - For more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output). +For more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output). ## Credits diff --git a/docs/output.md b/docs/output.md index ffa089a9..e6efe8bc 100644 --- a/docs/output.md +++ b/docs/output.md @@ -29,6 +29,8 @@ The files in the BlobDir dataset which is used to create the online interactive - `*.json`: files generated from genome and alignment coverage statistics - `*.png`: static plot images +More information about visualising the data in the [BlobToolKit repository](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/viewer) + ### MultiQC diff --git a/docs/usage.md b/docs/usage.md index 143de417..dcec96e2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -48,6 +48,11 @@ sample3,ont,ont.cram An [example samplesheet](assets/test/samplesheet.csv) has been provided with the pipeline. +### Support for [nf-core/fetchngs](https://nf-co.re/fetchngs) + +The pipeline can also accept a samplesheet generated by the [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline (tested with version 1.11.0). +The pipeline then needs the `--fetchngs_samplesheet true` option *and* `--align true`, since the data files would all be unaligned. + ## Getting databases ready for the pipeline The BlobToolKit pipeline can be run in many different ways. The default way requires access to several databases: @@ -91,7 +96,6 @@ Retrieve the NCBI blast nt database (version 5) files and tar gunzip them. We ar ```bash wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ && -wget https://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz -P $NT && for file in $NT/*.tar.gz; do tar xf $file -C $NT && rm $file; done From 1fbde37229657bd6e152db9450c3006924262ef3 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:46:24 +0000 Subject: [PATCH 30/44] Explained the purpose of this release --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a326925f..054367fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-01-19] +The pipeline has now been validated on five genomes, all under 100 Mbp: a +sponge, a platyhelminth, and three fungi. 
+ ### Enhancements & fixes - Fixed the conditional runs of blastn From b8d3ae035013f8e5d4c560990a1f6159584d66b7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:47:29 +0000 Subject: [PATCH 31/44] [lint] prettier --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index dcec96e2..84229b17 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -51,7 +51,7 @@ An [example samplesheet](assets/test/samplesheet.csv) has been provided with the ### Support for [nf-core/fetchngs](https://nf-co.re/fetchngs) The pipeline can also accept a samplesheet generated by the [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline (tested with version 1.11.0). -The pipeline then needs the `--fetchngs_samplesheet true` option *and* `--align true`, since the data files would all be unaligned. +The pipeline then needs the `--fetchngs_samplesheet true` option _and_ `--align true`, since the data files would all be unaligned. ## Getting databases ready for the pipeline From 50d2a71a35fc7cd4d02ecd61e503845719b740cf Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 19 Jan 2024 12:49:15 +0000 Subject: [PATCH 32/44] Revert "Let's make it happen today" This reverts commit 5e7d53a2d544dc780344fbc671b674825924d7e2. --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 054367fd..2988faed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-01-19] +## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-01-XX] The pipeline has now been validated on five genomes, all under 100 Mbp: a sponge, a platyhelminth, and three fungi. 
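
As background to PATCH 17/44 and PATCH 22/44 above: the getFileSuffix helper from the local cat/cat patch (later adopted upstream, which is why the local cat-cat.diff could be dropped) can be exercised as plain Groovy. The assertions below are illustrative only, not part of the module:

// For .gz files the helper also keeps the second-to-last extension (e.g. ".fasta.gz");
// otherwise it falls back to the last extension.
def getFileSuffix(filename) {
    def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/
    return match ? match[0][1] : filename.substring(filename.lastIndexOf('.'))
}

assert getFileSuffix("sample.fasta.gz") == ".fasta.gz"   // both extensions kept
assert getFileSuffix("sample.fasta")    == ".fasta"      // no .gz, so last extension only
assert getFileSuffix("sample.gz")       == ".gz"         // bare .gz: regex misses, fallback applies
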
From 5388d779c3664695a40efbbc9571c723a5e80a55 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 22 Jan 2024 14:38:48 +0000 Subject: [PATCH 33/44] Pin the version of prettier to match the nf-core dependency we have --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 905c58e4..5c001706 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -32,7 +32,7 @@ jobs: - uses: actions/setup-node@v4 - name: Install Prettier - run: npm install -g prettier + run: npm install -g prettier@3.1.0 - name: Run Prettier --check run: prettier --check ${GITHUB_WORKSPACE} From 1dbbe9eb79e1b718690099217cda396fda5a406f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 22 Jan 2024 14:45:49 +0000 Subject: [PATCH 34/44] Need to make an exception for this file --- .nf-core.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.nf-core.yml b/.nf-core.yml index 3e9d09b4..2a47982a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -17,6 +17,7 @@ lint: - docs/images/nf-core-blobtoolkit_logo_dark.png - .github/ISSUE_TEMPLATE/bug_report.yml - .github/PULL_REQUEST_TEMPLATE.md + - .github/workflows/linting.yml multiqc_config: - report_comment nextflow_config: From 1303a6993963e3e6030ea1486b760ac3744aa8d1 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 25 Jan 2024 16:09:28 +0000 Subject: [PATCH 35/44] bugfix: if only 1 BUSCO is provided (only lineage with a match), the object is a path, not a list --- modules/local/blobtoolkit/countbuscos.nf | 2 +- modules/local/blobtoolkit/createblobdir.nf | 2 +- modules/local/blobtoolkit/extractbuscos.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/blobtoolkit/countbuscos.nf b/modules/local/blobtoolkit/countbuscos.nf index bd817bae..203633e1 100644 --- a/modules/local/blobtoolkit/countbuscos.nf +++ b/modules/local/blobtoolkit/countbuscos.nf @@ -21,7 +21,7 @@ process BLOBTOOLKIT_COUNTBUSCOS { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def busco_inputs = table.collect{"--in $it"}.join(' ') + def busco_inputs = (table instanceof List ? table : [table]).collect{"--in $it"}.join(' ') """ btk pipeline count-busco-genes \\ $busco_inputs \\ diff --git a/modules/local/blobtoolkit/createblobdir.nf b/modules/local/blobtoolkit/createblobdir.nf index b61b8fc9..2c8517ab 100644 --- a/modules/local/blobtoolkit/createblobdir.nf +++ b/modules/local/blobtoolkit/createblobdir.nf @@ -24,7 +24,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def busco_args = busco.collect { "--busco " + it } .join(' ') + def busco_args = (busco instanceof List ? busco : [busco]).collect { "--busco " + it } .join(' ') def hits_blastp = blastp ? "--hits ${blastp}" : "" """ blobtools replace \\ diff --git a/modules/local/blobtoolkit/extractbuscos.nf b/modules/local/blobtoolkit/extractbuscos.nf index 1329dd8a..128780fe 100644 --- a/modules/local/blobtoolkit/extractbuscos.nf +++ b/modules/local/blobtoolkit/extractbuscos.nf @@ -21,7 +21,7 @@ process BLOBTOOLKIT_EXTRACTBUSCOS { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def seq_args = seq.collect { "--busco " + it } .join(' ') + def seq_args = (seq instanceof List ? 
seq : [seq]).collect { "--busco " + it } .join(' ') """ btk pipeline extract-busco-genes \\ $seq_args \\ From 8d2ebed978ad3d15c0a7fc3d4535655a08a6bb95 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 7 Feb 2024 14:48:45 +0000 Subject: [PATCH 36/44] Allow chosing the output format (png vs svg) --- CHANGELOG.md | 2 ++ modules/local/blobtk/images.nf | 4 +++- nextflow.config | 3 +++ nextflow_schema.json | 6 ++++++ subworkflows/local/view.nf | 7 ++++--- 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2988faed..ca1aa205 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,12 +21,14 @@ sponge, a platyhelminth, and three fungi. - FastQ files can bypass the conversion to Fasta - Fixed missing BUSCO results from the blobdir (only 1 BUSCO was loaded) - Fixed the default category used to colour the blob plots +- Added an option to select the format of the images (PNG or SVG) ### Parameters | Old parameter | New parameter | | ------------- | ---------------------- | | | --fetchngs_samplesheet | +| | --image_format | > **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. diff --git a/modules/local/blobtk/images.nf b/modules/local/blobtk/images.nf index 1b6e8087..94669298 100644 --- a/modules/local/blobtk/images.nf +++ b/modules/local/blobtk/images.nf @@ -10,9 +10,11 @@ process BLOBTK_IMAGES { input: tuple val(meta), path(blobdir) each plot + val format output: tuple val(meta), path('*.png') , emit: png + tuple val(meta), path('*.svg') , emit: svg path "versions.yml" , emit: versions when: @@ -26,7 +28,7 @@ process BLOBTK_IMAGES { blobtk plot \\ -v ${plot} \\ -d ${blobdir} \\ - -o ${prefix}.${plot}.png \\ + -o ${prefix}.${plot}.${format} \\ ${legend} \\ $args diff --git a/nextflow.config b/nextflow.config index 25d6f721..97085869 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,9 @@ params { taxon = null taxa_file = null + // Output options + image_format = 'png' + // Databases and related options taxdump = null busco = null diff --git a/nextflow_schema.json b/nextflow_schema.json index a9dc4885..b9ca59bf 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -43,6 +43,12 @@ "description": "Custom config file for draft assembly", "fa_icon": "fas fa-file-alt" }, + "image_format": { + "type": "string", + "enum": ["png", "svg"], + "description": "Select the format of the output images.", + "fa_icon": "fas fa-image" + }, "outdir": { "type": "string", "format": "directory-path", diff --git a/subworkflows/local/view.nf b/subworkflows/local/view.nf index 505d6c36..e2de7ede 100644 --- a/subworkflows/local/view.nf +++ b/subworkflows/local/view.nf @@ -22,16 +22,17 @@ workflow VIEW { // - // Generate static plots in png format + // Generate static plots in png/svg format // plots = [ "blob", "cumulative", "snail" ] - BLOBTK_IMAGES ( blobdir, plots ) + BLOBTK_IMAGES ( blobdir, plots, params.image_format ) ch_versions = ch_versions.mix( BLOBTK_IMAGES.out.versions ) + ch_images = BLOBTK_IMAGES.out.png.mix(BLOBTK_IMAGES.out.svg) emit: summary = BLOBTOOLKIT_SUMMARY.out.json // channel: [ val(meta), path(json) ] - images = BLOBTK_IMAGES.out.png // channel: [ val(meta), path(png) ] + images = ch_images // channel: [ val(meta), path(png/svg) ] versions = ch_versions // channel: [ versions.yml ] } From 2a3eca79c686a4b57eac6ae105c08fb474e622ad Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 7 Feb 2024 14:49:00 +0000 Subject: [PATCH 37/44] typo --- modules/local/blobtoolkit/windowstats.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/blobtoolkit/windowstats.nf b/modules/local/blobtoolkit/windowstats.nf index 2975ede1..26bd49f5 100644 --- a/modules/local/blobtoolkit/windowstats.nf +++ b/modules/local/blobtoolkit/windowstats.nf @@ -3,7 +3,7 @@ process BLOBTOOLKIT_WINDOWSTATS { label 'process_single' if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "GET_WINDOW_STATS module does not support Conda. Please use Docker / Singularity / Podman instead." + exit 1, "BLOBTOOLKIT_WINDOWSTATS module does not support Conda. Please use Docker / Singularity / Podman instead." 
} container "docker.io/genomehubs/blobtoolkit:4.3.3" From c4c53242e8695890ff19e077463309cfa83b6bef Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 7 Feb 2024 14:49:39 +0000 Subject: [PATCH 38/44] Changed the output directory of the images --- CHANGELOG.md | 1 + conf/modules.config | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca1aa205..a0a9ff5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ sponge, a platyhelminth, and three fungi. - FastQ files can bypass the conversion to Fasta - Fixed missing BUSCO results from the blobdir (only 1 BUSCO was loaded) - Fixed the default category used to colour the blob plots +- Fixed the output directory of the images - Added an option to select the format of the images (PNG or SVG) ### Parameters diff --git a/conf/modules.config b/conf/modules.config index bebb909e..a12be614 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -118,7 +118,7 @@ process { withName: "BLOBTK_IMAGES" { publishDir = [ - path: { "${params.outdir}/${blobdir.name}/" }, + path: { "${params.outdir}/blobtoolkit/plots" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] From 1f4284b45abc2578d1825f2556e102c162738b70 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Feb 2024 11:07:32 +0000 Subject: [PATCH 39/44] Because only one format can be chosen, they have to be marked as optional --- modules/local/blobtk/images.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/blobtk/images.nf b/modules/local/blobtk/images.nf index 94669298..48a9b1d4 100644 --- a/modules/local/blobtk/images.nf +++ b/modules/local/blobtk/images.nf @@ -13,8 +13,8 @@ process BLOBTK_IMAGES { val format output: - tuple val(meta), path('*.png') , emit: png - tuple val(meta), path('*.svg') , emit: svg + tuple val(meta), path('*.png') , optional: true, emit: png + tuple val(meta), path('*.svg') , optional: true, emit: svg path "versions.yml" , emit: versions when: From fcfdb10dfe50fc8ab55715e8e7722c8e123a44e1 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 7 Feb 2024 14:49:53 +0000 Subject: [PATCH 40/44] Moved the blobdir a level below, as per our convention Since the blobdir is incrementally built, only the last version needs to be published --- conf/modules.config | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a12be614..5e672cc2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -92,25 +92,15 @@ process { withName: "BLOBTOOLKIT_CREATEBLOBDIR" { ext.args = "--evalue 1.0e-25 --hit-count 10" - publishDir = [ - path: { "${params.outdir}/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals("versions.yml") ? null : filename } - ] } withName: "BLOBTOOLKIT_UPDATEBLOBDIR" { ext.args = "--evalue 1.0e-25 --hit-count 10" - publishDir = [ - path: { "${params.outdir}/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals("versions.yml") ? null : filename } - ] } withName: "BLOBTOOLKIT_SUMMARY" { publishDir = [ - path: { "${params.outdir}/${blobdir.name}/" }, + path: { "${params.outdir}/blobtoolkit/${blobdir.name}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename } ] @@ -150,7 +140,7 @@ process { withName: "BLOBTOOLKIT_UPDATEMETA" { publishDir = [ - path: { "${params.outdir}/" }, + path: { "${params.outdir}/blobtoolkit" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] From a105dd358bb1fc7ab4d94ffc48bdf54136a9f3b2 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Feb 2024 11:11:08 +0000 Subject: [PATCH 41/44] Lower case, please --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 5e672cc2..974728f5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -68,7 +68,7 @@ process { '--force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\'' : '--force' } publishDir = [ - path: { "${params.outdir}/BUSCO" }, + path: { "${params.outdir}/busco" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] From ff87d76a2bccc75312f67e84f1b11e8bbb45edd6 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Feb 2024 11:26:56 +0000 Subject: [PATCH 42/44] Pin the version of nf-core --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5c001706..8d12b98a 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -84,7 +84,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core + pip install nf-core==2.11 - name: Run nf-core lint env: From 21db9cf0f65d5ab57fc432a995083d3c41697bb6 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Feb 2024 12:54:33 +0000 Subject: [PATCH 43/44] Bumped the version number up. A release is coming ! --- CHANGELOG.md | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0a9ff5f..56007304 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-01-XX] +## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-02-09] The pipeline has now been validated on five genomes, all under 100 Mbp: a sponge, a platyhelminth, and three fungi. 
diff --git a/nextflow.config b/nextflow.config index 97085869..6c9fadf8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -247,7 +247,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.2.0' + version = '0.3.0' doi = '10.5281/zenodo.7949058' } From aabcfc6ee57fc9d8a96e3bdf51907aeb003cf07e Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sat, 10 Feb 2024 09:36:21 +0000 Subject: [PATCH 44/44] Allow integers (taxon_ids) to be passed --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b9ca59bf..97c84534 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -78,8 +78,8 @@ "required": ["taxon", "accession", "fasta"], "properties": { "taxon": { - "type": "string", - "description": "NCBI taxonomy ID for the genome species" + "type": ["string", "integer"], + "description": "Name or taxonomy ID for the genome species" }, "accession": { "type": "string",
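
A closing note on the guard introduced in PATCH 35/44 above: when exactly one BUSCO table is staged, Nextflow hands the script block a single path rather than a one-element list, so calling collect on it directly would iterate over the wrong thing. A minimal sketch of the normalisation, in plain Groovy with illustrative paths:

// Wrap a scalar into a one-element list before building the command-line arguments.
def asArgs = { tables ->
    (tables instanceof List ? tables : [tables]).collect { "--in $it" }.join(' ')
}

assert asArgs("run1/full_table.tsv") == "--in run1/full_table.tsv"
assert asArgs(["run1/full_table.tsv", "run2/full_table.tsv"]) == "--in run1/full_table.tsv --in run2/full_table.tsv"
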