Merge pull request #112 from sanger-tol/1.2_release
1.2 release
muffato authored May 1, 2024
2 parents 769c00f + 9203311 commit d4b2698
Showing 27 changed files with 309 additions and 179 deletions.
1 change: 1 addition & 0 deletions .github/workflows/sanger_test.yml
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@ jobs:
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}",
"use_work_dir_as_temp": true,
}
profiles: test,sanger,singularity,cleanup

31 changes: 31 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,37 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.2.0](https://github.com/sanger-tol/genomenote/releases/tag/1.2.0)] - Pyrenean Mountain Dog - [2024-05-10]

### Enhancements & fixes

- Updated the MerquryFK resources to cope with mistletoe (the pipeline as a
whole is not yet fully compatible with mistletoe, though).
- Updated the Busco resources to better deal with large genomes.
- Chromosome lengths are now reported in Mbp, rounded to 2 decimal places.
- The pipeline is now publishing the Busco output directories.
- The pipeline now generates a contact map for each Hi-C sample (instead of
randomly picking one) and reports them all in the CSV.
- The Hi-C contact map is now ordered according to the karyotype (as defined in
  the assembly record) by default; the new `--cool_order` option can be used to
  override that order.

### Software dependencies

Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.

| Dependency | Old version | New version |
| ---------- | ----------- | ----------- |
| busco | 5.4.3 | 5.5.0 |

> **NB:** Dependency has been **updated** if both old and new version information is present. <br> **NB:** Dependency has been **added** if just the new version information is present. <br> **NB:** Dependency has been **removed** if version information isn't present.

### Parameters

| Old parameter | New parameter |
| ------------- | ------------- |
| | --cool_order |

## [[1.1.2](https://github.com/sanger-tol/genomenote/releases/tag/1.1.2)] - [2024-04-29]

### Enhancements & fixes
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -2,7 +2,7 @@
# Visit https://bit.ly/cffinit to generate yours today!

cff-version: 1.2.0
title: sanger-tol/genomenote v1.1.2
title: sanger-tol/genomenote v1.2.0
message: >-
If you use this software, please cite it using the
metadata from this file.
@@ -34,5 +34,5 @@ identifiers:
repository-code: "https://github.com/sanger-tol/genomenote"
license: MIT
commit: TODO
version: 1.1.2
version: 1.2.0
date-released: "2022-10-07"
11 changes: 5 additions & 6 deletions README.md
@@ -17,15 +17,14 @@

<!--![sanger-tol/genomenote workflow](https://raw.githubusercontent.com/sanger-tol/genomenote/main/docs/images/sanger-tol-genomenote_workflow.png)-->

1. Filter genome index ([`samtools faidx`](https://www.htslib.org/doc/samtools-faidx.html), `filter genome`)
1. Summary statistics ([`NCBI datasets summary genome accession`](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/datasets/summary/genome/datasets_summary_genome_accession/))
2. Convert alignment to BED ([`samtools view`](https://www.htslib.org/doc/samtools-view.html), [`bedtools bamtobed`](https://bedtools.readthedocs.io/en/latest/content/tools/bamtobed.html))
3. Filter BED ([`GNU sort`](https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html), [`filter bed`](https://raw.githubusercontent.com/sanger-tol/genomenote/main/bin/filter_bed.sh))
4. Contact maps ([`Cooler cload`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-cload-pairs), [`Cooler zoomify`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-zoomify), [`Cooler dump`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-dump))
5. Summary statistics ([`NCBI datasets summary genome accession`](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/datasets/summary/genome/datasets_summary_genome_accession/))
6. Genome completeness ([`NCBI API`](https://www.ncbi.nlm.nih.gov/datasets/docs/v1/reference-docs/rest-api/), [`BUSCO`](https://busco.ezlab.org))
7. Consensus quality and k-mer completeness ([`FASTK`](https://github.com/thegenemyers/FASTK), [`MERQURY.FK`](https://github.com/thegenemyers/MERQURY.FK))
8. Collated summary table ([`createtable`](bin/create_table.py))
9. Present results and visualisations ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
5. Genome completeness ([`NCBI API`](https://www.ncbi.nlm.nih.gov/datasets/docs/v1/reference-docs/rest-api/), [`BUSCO`](https://busco.ezlab.org))
6. Consensus quality and k-mer completeness ([`FASTK`](https://github.com/thegenemyers/FASTK), [`MERQURY.FK`](https://github.com/thegenemyers/MERQURY.FK))
7. Collated summary table ([`createtable`](bin/create_table.py))
8. Present results and visualisations ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))

## Usage

3 changes: 2 additions & 1 deletion assets/samplesheet.csv
@@ -1,4 +1,5 @@
sample,datatype,datafile
uoEpiScrs1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/genomic_data/uoEpiScrs1/pacbio/m64228e_220617_134154.ccs.bc1015_BAK8B_OA--bc1015_BAK8B_OA.rmdup.subset.bam
uoEpiScrs1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/genomic_data/uoEpiScrs1/pacbio/m64016e_220621_193126.ccs.bc1008_BAK8A_OA--bc1008_BAK8A_OA.rmdup.subset.bam
uoEpiScrs1,hic,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/analysis/uoEpiScrs1.1/read_mapping/hic/GCA_946965045.1.unmasked.hic.uoEpiScrs1.subsampled.cram
uoEpiScrs1c,hic,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/analysis/uoEpiScrs1.1/read_mapping/hic/GCA_946965045.1.unmasked.hic.uoEpiScrs1.subsampled.cram
uoEpiScrs1b,hic,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/analysis/uoEpiScrs1.1/read_mapping/hic/GCA_946965045.1.unmasked.hic.uoEpiScrs1.subsampled.bam
19 changes: 13 additions & 6 deletions bin/create_table.py
@@ -17,10 +17,10 @@ def parse_args(args=None):
parser.add_argument("--busco", help="Input BUSCO short summary JSON file.")
parser.add_argument("--qv", nargs="*", help="Input QV TSV file from MERQURYFK.")
parser.add_argument("--completeness", nargs="*", help="Input COMPLETENESS stats TSV file from MERQURYFK.")
parser.add_argument("--hic", help="HiC sample ID used for contact maps.")
parser.add_argument("--flagstat", help="HiC flagstat file created by Samtools.")
parser.add_argument("--hic", action="append", help="HiC sample ID used for contact maps.")
parser.add_argument("--flagstat", action="append", help="HiC flagstat file created by Samtools.")
parser.add_argument("--outcsv", help="Output CSV file.", required=True)
parser.add_argument("--version", action="version", version="%(prog)s 2.0")
parser.add_argument("--version", action="version", version="%(prog)s 3.1")
return parser.parse_args(args)


@@ -89,7 +89,13 @@ def ncbi_stats(genome_in, seq_in, writer):
if not chromosome_header:
writer.writerow(["##Chromosome", "Length", "GC_Percent"])
chromosome_header = True
writer.writerow([mol["chr_name"], mol["length"], mol["gc_percent"]])
writer.writerow(
[
mol["chr_name"],
round(mol["length"] / 1000000, 2),
mol["gc_percent"],
]
)
organelle_header = False
for mol in seq:
if "gc_percent" in mol and mol["assembly_unit"] == "non-nuclear":
@@ -99,7 +105,7 @@
writer.writerow(
[
mol["assigned_molecule_location_type"],
mol["length"],
round(mol["length"] / 1000000, 2),
mol["gc_percent"],
]
)
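
Both hunks above apply the same conversion before writing the CSV rows; as a standalone Python sketch (the helper name is invented, `round(length / 1_000_000, 2)` is copied from the diff):

```python
def length_in_mbp(length_bp: int) -> float:
    """Convert a sequence length in bp to Mbp, rounded to 2 decimal places."""
    return round(length_bp / 1_000_000, 2)

# e.g. a 123,456,789 bp chromosome is reported as 123.46 Mbp
print(length_in_mbp(123456789))
```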
@@ -168,7 +174,8 @@ def main(args=None):
if args.qv and args.completeness is not None:
extract_pacbio(args.qv, args.completeness, writer)
if args.hic is not None:
extract_mapped(args.hic, args.flagstat, writer)
for hic, flagstat in zip(args.hic, args.flagstat):
extract_mapped(hic, flagstat, writer)


if __name__ == "__main__":
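
With `action="append"`, each repeated `--hic`/`--flagstat` flag accumulates into a list, so samples and flagstat files pair up positionally via `zip`, as in the `main()` change above. A self-contained sketch (sample IDs and paths are invented):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--hic", action="append", help="HiC sample ID used for contact maps.")
parser.add_argument("--flagstat", action="append", help="HiC flagstat file created by Samtools.")

# Simulate two Hi-C samples passed on the command line.
args = parser.parse_args(
    ["--hic", "sample1", "--flagstat", "1/s1.flagstat",
     "--hic", "sample2", "--flagstat", "2/s2.flagstat"]
)

# Repeated flags append in order, so zip pairs each sample with its file.
pairs = list(zip(args.hic, args.flagstat))
print(pairs)  # [('sample1', '1/s1.flagstat'), ('sample2', '2/s2.flagstat')]
```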
29 changes: 29 additions & 0 deletions bin/get_chr_list.sh
@@ -0,0 +1,29 @@
#!/bin/bash

infile=$1
filter_lst=$2
ord_list=$3

jq -r '.reports[] | [.genbank_accession, .length] | @tsv' < "$infile" | sort -k2,2 -nr > "$filter_lst"

if [[ -n "$ord_list" ]]; then
echo "Working with ordered list"
new_lst="new_filter.lst"

while read -r line; do
grep -F "$line" "$filter_lst"
done < "$ord_list" > "$new_lst"

grep -v -F -f "$ord_list" "$filter_lst" >> "$new_lst"

c1=$(wc -l < "$filter_lst")
c2=$(wc -l < "$new_lst")

if [ "$c1" -ne "$c2" ]; then
echo "The old and new files have a different number of rows."
exit 1
fi

cp "$new_lst" "$filter_lst"
fi
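
The reordering in `get_chr_list.sh` can be paraphrased in Python (a sketch of the logic, not the shipped code): chromosomes named in the karyotype order file come first, in that order, and everything else keeps its length-sorted order. Like the script's `grep -F`, this matches names as substrings of the accession field.

```python
def reorder_chromosomes(filter_rows, ordered_names):
    """filter_rows: (accession, length) tuples, pre-sorted by length descending.
    ordered_names: chromosome names in karyotype order (may be empty)."""
    # Rows whose accession contains an ordered name, in karyotype order.
    named = [row for name in ordered_names for row in filter_rows if name in row[0]]
    # Remaining rows keep their original (length-sorted) order.
    rest = [row for row in filter_rows if row not in named]
    return named + rest

rows = [("chr2_acc", 200), ("chr1_acc", 150), ("chrX_acc", 100)]
print(reorder_chromosomes(rows, ["chr1", "chr2"]))
```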

4 changes: 2 additions & 2 deletions bin/get_odb.py
@@ -18,7 +18,7 @@ def parse_args(args=None):
parser.add_argument("NCBI_SUMMARY_JSON", help="NCBI entry for this assembly (in JSON).")
parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
parser.add_argument("FILE_OUT", help="Output CSV file.")
parser.add_argument("--version", action="version", version="%(prog)s 1.0")
parser.add_argument("--version", action="version", version="%(prog)s 1.1")
return parser.parse_args(args)


@@ -48,7 +48,7 @@ def get_odb(ncbi_summary, lineage_tax_ids, file_out):
odb_arr = [lineage_tax_ids_dict[taxon_id] for taxon_id in ancestor_taxon_ids if taxon_id in lineage_tax_ids_dict]

# The most recent [-1] OBD10 lineage is selected
odb_val = odb_arr[-1]
odb_val = odb_arr[-1] + "_odb10"
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
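
The `_odb10` fix above selects the most specific matching BUSCO lineage and appends the database suffix; roughly (taxon IDs and lineage names here are illustrative):

```python
def pick_odb_lineage(ancestor_taxon_ids, lineage_tax_ids_dict):
    """Return the most recent matching lineage name, with the ODB10 suffix."""
    odb_arr = [lineage_tax_ids_dict[t] for t in ancestor_taxon_ids if t in lineage_tax_ids_dict]
    # Ancestors are listed root-first, so [-1] is the most specific lineage.
    return odb_arr[-1] + "_odb10"

lineages = {"33208": "metazoa", "7742": "vertebrata"}
print(pick_odb_lineage(["33208", "7742"], lineages))  # vertebrata_odb10
```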

30 changes: 15 additions & 15 deletions conf/base.config
@@ -40,25 +40,25 @@ process {
}

withName: MERQURYFK_MERQURYFK {
// Memory usage seems to be following two different linear rules:
// - 1 GB for every 60 Mbp for genomes < 840 Mbp
// - 2 GB for every 1 Gbp for genomes > 840 Mbp, with a 12 GB offset to match the previous rule
memory = { check_max( 1.GB + ((meta.genome_size < 840000000) ? (Math.ceil(meta.genome_size/60000000) * 1.GB * task.attempt) : (Math.ceil(meta.genome_size/1000000000) * 2.GB * task.attempt + 12.GB)), 'memory' ) }
time = { check_max( 10.min * Math.ceil(meta.genome_size/1000000000) * task.attempt, 'time') }
// Memory usage is probably correlated to the diversity of k-mers found
// in the assembly, which grows with the genome size but then plateaus.
// The memory is increased by half of the base value at every attempt.
memory = { check_max( (
meta.genome_size < 100000000 ? 6.GB :
meta.genome_size < 500000000 ? 12.GB : 24.GB
) * ((task.attempt+1)/2) , 'memory' ) }
cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
}
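
The new MerquryFK rule replaces the linear formula with plateaus, scaled by `(task.attempt + 1) / 2` so each retry adds half of the base value (Groovy's `/` on integers yields a decimal, so the factor is 1, 1.5, 2, …). A Python transcription of the rule, with GB values copied from the config:

```python
def merquryfk_memory_gb(genome_size: int, attempt: int) -> float:
    """Plateaued base memory, grown by half the base on each retry."""
    if genome_size < 100_000_000:
        base = 6
    elif genome_size < 500_000_000:
        base = 12
    else:
        base = 24
    return base * (attempt + 1) / 2

print(merquryfk_memory_gb(300_000_000, 1))  # 12.0 GB on the first attempt
print(merquryfk_memory_gb(300_000_000, 2))  # 18.0 GB on retry
```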

withName: BUSCO {
// No straightforward formula, so using ranges instead.
// The memory is increased by half of the base value at every attempt.
memory = { check_max( (
meta.genome_size < 100000000 ? 4.GB :
meta.genome_size < 500000000 ? 8.GB :
meta.genome_size < 1000000000 ? 16.GB :
meta.genome_size < 2000000000 ? 32.GB :
meta.genome_size < 5000000000 ? 64.GB : 192.GB
) * ((task.attempt+1)/2) , 'memory' ) }
cpus = { log_increase_cpus(4, 2*task.attempt, Math.ceil(meta.genome_size/1000000000), 2) }
time = { check_max( 2.h * Math.ceil(meta.genome_size/1000000000) * task.attempt, 'time') }
// The formulas below are equivalent to these ranges:
// Gbp: [ 1, 2, 4, 8, 16]
// CPUs: [ 8, 12, 16, 20, 24]
// GB RAM: [16, 32, 64, 128, 256]
memory = { check_max( 1.GB * Math.pow(2, 3 + task.attempt + Math.ceil(positive_log(meta.genome_size/1000000000, 2))) , 'memory' ) }
cpus = { log_increase_cpus(4, 4*task.attempt, Math.ceil(meta.genome_size/1000000000), 2) }
time = { check_max( 3.h * Math.ceil(meta.genome_size/1000000000) * task.attempt, 'time') }
}
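
The BUSCO memory formula doubles with each attempt and with each doubling of genome size. A quick check that it reproduces the commented table on the first attempt, assuming `positive_log(x, 2)` in the config is log base 2 clamped at zero:

```python
import math

def busco_memory_gb(genome_size_bp: int, attempt: int) -> int:
    """Mirror of the config: 1 GB * 2^(3 + attempt + ceil(positive_log(Gbp, 2)))."""
    gbp = genome_size_bp / 1_000_000_000
    # Assumed behaviour of the config's positive_log helper: log2 clamped at 0.
    positive_log = max(0.0, math.log2(gbp))
    return 2 ** (3 + attempt + math.ceil(positive_log))

# First attempt reproduces the comment: 1/2/4/8/16 Gbp -> 16/32/64/128/256 GB
print([busco_memory_gb(g * 10**9, 1) for g in (1, 2, 4, 8, 16)])
```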

withName: 'BED_SORT|FILTER_SORT' {
8 changes: 8 additions & 0 deletions conf/modules.config
@@ -58,6 +58,14 @@
: '--mode genome --tar' }
}

withName: "RESTRUCTUREBUSCODIR" {
publishDir = [
path: { "${params.outdir}/busco" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
]
}

withName: SUMMARYSEQUENCE {
ext.prefix = { "${meta.id}_sequence" }
ext.args = "--report sequence"
3 changes: 3 additions & 0 deletions conf/test_full.config
@@ -24,4 +24,7 @@ params {

// Databases
lineage_db = "/lustre/scratch123/tol/resources/busco/v5"

// Need to be set to avoid overfilling /tmp
use_work_dir_as_temp = true
}
18 changes: 18 additions & 0 deletions docs/output.md
@@ -12,6 +12,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

- [Contact maps](#contact-maps) – Contact matrix created using HiC sequencing data
- [Genome statistics](#genome-statistics) – Collated assembly information, genome statistics and alignment quality information
- [BUSCO](#busco) - BUSCO results
- [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution

@@ -41,6 +42,23 @@ This pipeline collates (1) assembly information, statistics and chromosome detai

</details>

### BUSCO

BUSCO results generated by the pipeline (all BUSCO lineages that match the classification of the species).

<details markdown="1">
<summary>Output files</summary>

- `busco/`
- `<lineage_name>`
- `short_summary.{json|tsv|txt}`: BUSCO scores in various formats.
- `full_table.tsv`: List and coordinates of the BUSCO genes that were found.
- `missing_busco_list.tsv`: BUSCO genes that could not be found.
- `{single,multi,fragmented}_busco_sequences.tar.gz`: Sequence files of the annotated genes.
- `hmmer_output.tar.gz`: Scores and outputs from the HMMER searches.

</details>

### MultiQC

<details markdown="1">
7 changes: 1 addition & 6 deletions modules.json
@@ -12,7 +12,7 @@
},
"busco": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"git_sha": "e3126f437c336c826f242842fe51769cfce0ec2d",
"installed_by": ["modules"],
"patch": "modules/nf-core/busco/busco.diff"
},
@@ -63,11 +63,6 @@
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"]
},
"samtools/faidx": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"]
},
"samtools/view": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
6 changes: 3 additions & 3 deletions modules/local/createtable.nf
@@ -11,7 +11,7 @@ process CREATETABLE {
tuple val(meta), path(genome_summary), path(sequence_summary)
tuple val(meta1), path(busco)
tuple val(meta2), path(qv), path(completeness)
tuple val(meta3), path(flagstat)
tuple val(meta3s), path(flagstats, stageAs: "?/*")

output:
tuple val(meta), path("*.csv"), emit: csv
@@ -27,8 +27,8 @@
def bus = busco ? "--busco ${busco}" : ""
def mqv = qv ? "--qv ${qv}" : ""
def mco = completeness ? "--completeness ${completeness}" : ""
def hic = flagstat ? "--hic ${meta3.id}" : ""
def fst = flagstat ? "--flagstat ${flagstat}" : ""
def hic = meta3s.collect { "--hic " + it.id } .join(' ')
def fst = (flagstats instanceof List ? flagstats : [flagstats]).collect { "--flagstat " + it } .join(' ')
"""
create_table.py \\
$gen \\
@@ -1,17 +1,18 @@
process FILTER_GENOME {
process GET_CHROMLIST {
tag "$meta.id"
label 'process_single'

conda "bioconda::coreutils=8.25"
conda "conda-forge::jq=1.6"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/coreutils:8.25--1' :
'biocontainers/coreutils:8.25--1' }"
'https://depot.galaxyproject.org/singularity/jq:1.6' :
'biocontainers/jq:1.6' }"

input:
tuple val(meta), path(fai)
tuple val(meta), path(json)
path ord

output:
tuple val(meta), path("*.list"), emit: list
tuple val(meta), path("*_chrom.list"), emit: list
path "versions.yml" , emit: versions

when:
@@ -21,7 +22,7 @@ process FILTER_GENOME {
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
cut -f1,2 $fai | sed 's/-/_/g' | sort -k2,2 -nr > ${prefix}_filtered.list
get_chr_list.sh $json ${prefix}_chrom.list $ord
cat <<-END_VERSIONS > versions.yml
"${task.process}":
