Merge pull request #99 from sanger-tol/more_published_files

Publish peripheral data as well, even if we don't use it ourselves
sanger-tol · Jul 1, 2024 · 7547ead · 7547ead
2 parents d16a4df + aef0532
commit 7547ead
Show file tree

Hide file tree

Showing 12 changed files with 287 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ General tidy up of the configuration and the pipeline
 
 - Increased the resources for blastn
 - Removed some options that were not used or not needed
+- All relevant outputs are now copied to the output directory
 
 ### Parameters
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -48,6 +48,14 @@ process {
         ext.args = { "-ax map-ont -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
     }
 
+    withName: "MINIMAP2_.*" {
+        publishDir = [
+            path: { "${params.outdir}/read_mapping/${meta.datatype}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
     withName: "SAMTOOLS_VIEW" {
         ext.args = "--output-fmt bam --write-index"
     }
@@ -60,6 +68,22 @@ process {
         ext.args = "--lineage --busco"
     }
 
+    withName: "PIGZ_COMPRESS" {
+        publishDir = [
+            path: { "${params.outdir}/base_content" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename.minus("fw_out/") }
+        ]
+    }
+
+    withName: "BLOBTK_DEPTH" {
+        publishDir = [
+            path: { "${params.outdir}/read_mapping/${meta.datatype}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : "${meta.id}.coverage.1k.bed.gz" }
+        ]
+    }
+
     withName: "BUSCO" {
         scratch = true
         ext.args = { 'test' in workflow.profile.tokenize(',') ?

diff --git a/docs/output.md b/docs/output.md
@@ -15,6 +15,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [BlobDir](#blobdir) - Output files viewable on a [BlobToolKit viewer](https://github.com/blobtoolkit/blobtoolkit)
 - [Static plots](#static-plots) - Static versions of the BlobToolKit plots
 - [BUSCO](#busco) - BUSCO results
+- [Read alignments](#read-alignments) - Aligned reads (optional)
+- [Read coverage](#read-coverage) - Read coverage tracks
+- [Base content](#base-content) - _k_-mer statistics (for k &le; 4)
 - [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
@@ -26,8 +29,8 @@ The files in the BlobDir dataset which is used to create the online interactive
 <summary>Output files</summary>
 
 - `blobtoolkit/`
-  - `<accession>/`
-    - `*.json.gz`: files generated from genome and alignment coverage statistics
+  - `<assembly-name>/`
+    - `*.json.gz`: files generated from genome and alignment coverage statistics.
 
 More information about visualising the data in the [BlobToolKit repository](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/viewer)
 
@@ -53,12 +56,56 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the claas
 <details markdown="1">
 <summary>Output files</summary>
 
-- `blobtoolkit/`
-  - `busco/`
-    - `*.batch_summary.txt`: BUSCO scores as tab-separated files (1 file per lineage).
-    - `*.fasta.txt`: BUSCO scores as formatted text (1 file per lineage).
-    - `*.json`: BUSCO scores as JSON (1 file per lineage).
-    - `*/`: all output BUSCO files, including the coordinate and sequence files of the annotated genes.
+- `busco/`
+  - `<lineage-name>/`
+    - `short_summary.json`: BUSCO scores for that lineage as JSON.
+    - `short_summary.tsv`: BUSCO scores for that lineage as a tab-separated file.
+    - `short_summary.txt`: BUSCO scores for that lineage as formatted text.
+    - `full_table.tsv`: Coordinates of the annotated BUSCO genes as a tab-separated file.
+    - `missing_busco_list.tsv`: List of the BUSCO genes that could not be found.
+    - `*_busco_sequences.tar.gz`: Sequences of the annotated BUSCO genes. 1 _tar_ archive for each of the three annotation levels (`single_copy`, `multi_copy`, `fragmented`), with 1 file per gene.
+    - `hmmer_output.tar.gz`: Archive of the HMMER alignment scores.
+
+</details>
+
+### Read alignments
+
+Read alignments in BAM format -- only if the pipeline is run with `--align`.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `read_mapping/`
+  - `<datatype>/`
+    - `<sample>.bam`: alignments of that sample's reads in BAM format.
+
+</details>
+
+### Read coverage
+
+Read coverage statistics as computed by the pipeline.
+Those files are the raw data used to build the BlobDir.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `read_mapping/`
+  - `<datatype>/`
+    - `<sample>.coverage.1k.bed.gz`: Bedgraph file with the coverage of that sample's alignments in 1 kbp windows.
+
+</details>
+
+### Base content
+
+_k_-mer statistics.
+Those files are the raw data used to build the BlobDir.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `base_content/`
+  - `<assembly-name>_*nuc_windows.tsv.gz`: Tab-separated files with the counts of every _k_-mer for k &le; 4 in 1 kbp windows. The first three columns correspond to the coordinates (sequence name, start, end), followed by each _k_-mer.
+  - `<assembly-name>_freq_windows.tsv.gz`: Tab-separated files with frequencies derived from the _k_-mer counts.
 
 </details>
 

diff --git a/modules.json b/modules.json
@@ -64,6 +64,11 @@
                         "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
                         "installed_by": ["modules"]
                     },
+                    "pigz/compress": {
+                        "branch": "master",
+                        "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/fasta": {
                         "branch": "master",
                         "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62",

diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEMETA {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/pacificbiosciences/pyyaml:5.3.1"
+    container "docker.io/genomehubs/blobtoolkit:4.3.9"
 
     input:
     tuple val(meta), path(input)

diff --git a/modules/nf-core/pigz/compress/environment.yml b/modules/nf-core/pigz/compress/environment.yml
diff --git a/modules/nf-core/pigz/compress/main.nf b/modules/nf-core/pigz/compress/main.nf
diff --git a/modules/nf-core/pigz/compress/meta.yml b/modules/nf-core/pigz/compress/meta.yml
diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test b/modules/nf-core/pigz/compress/tests/main.nf.test
diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test.snap b/modules/nf-core/pigz/compress/tests/main.nf.test.snap
diff --git a/modules/nf-core/pigz/compress/tests/tags.yml b/modules/nf-core/pigz/compress/tests/tags.yml
diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf
@@ -6,6 +6,7 @@ include { SAMTOOLS_VIEW  } from '../../modules/nf-core/samtools/view/main'
 include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main'
 include { BLOBTK_DEPTH   } from '../../modules/local/blobtk/depth'
 include { FASTAWINDOWS   } from '../../modules/nf-core/fastawindows/main'
+include { PIGZ_COMPRESS  } from '../../modules/nf-core/pigz/compress/main'
 include { CREATE_BED     } from '../../modules/local/create_bed'
 
 
@@ -53,6 +54,17 @@ workflow COVERAGE_STATS {
     ch_versions = ch_versions.mix ( FASTAWINDOWS.out.versions.first() )
 
 
+    // Compress the TSV files
+    PIGZ_COMPRESS (
+        FASTAWINDOWS.out.mononuc
+        | mix ( FASTAWINDOWS.out.dinuc )
+        | mix ( FASTAWINDOWS.out.trinuc )
+        | mix ( FASTAWINDOWS.out.tetranuc )
+        | mix ( FASTAWINDOWS.out.freq )
+    )
+    ch_versions = ch_versions.mix ( PIGZ_COMPRESS.out.versions.first() )
+
+
     // Create genome windows file in BED format
     CREATE_BED ( FASTAWINDOWS.out.mononuc )
     ch_versions = ch_versions.mix ( CREATE_BED.out.versions.first() )