Merge pull request #82 from sanger-tol/resource_optimisation
Resource optimisation
muffato authored Dec 18, 2023
2 parents bd02f75 + 79bcbbc commit 0f4e2a1
Showing 23 changed files with 320 additions and 350 deletions.
157 changes: 114 additions & 43 deletions conf/base.config
@@ -2,64 +2,135 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     sanger-tol/readmapping Nextflow base config file
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     A 'blank slate' config file, appropriate for general use on most high performance
     compute environments. Assumes that all software is installed and available on
     the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
 ----------------------------------------------------------------------------------------
 */
 
-process {
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Increasing the number of CPUs often gives diminishing returns, so we increase it
+    following a logarithm curve. Example:
+        - 0 < value <= 1: start + step
+        - 1 < value <= 2: start + 2*step
+        - 2 < value <= 4: start + 3*step
+        - 4 < value <= 8: start + 4*step
+    In order to support re-runs, the step increase may be multiplied by the attempt
+    number prior to calling this function.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
 
-    cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
-    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
-    time   = { check_max( 4.h  * task.attempt, 'time'   ) }
+// Modified logarithm function that doesn't return negative numbers
+def positive_log(value, base) {
+    if (value <= 1) {
+        return 0
+    } else {
+        return Math.log(value)/Math.log(base)
+    }
+}
+
+def log_increase_cpus(start, step, value, base) {
+    return check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus')
+}
 
-    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
-    maxRetries    = 1
+process {
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+    maxRetries    = 5
     maxErrors     = '-1'
 
-    // Process-specific resource requirements
-    // NOTE - Please try and re-use the labels below as much as possible.
-    //        These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
-    //        If possible, it would be nice to keep the same label naming convention when
-    //        adding in your local modules too.
-    // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
-    withLabel:process_single {
-        cpus   = { check_max( 1                  , 'cpus'    ) }
-        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h  * task.attempt, 'time'    ) }
+    // In this configuration file, we give little resources by default and
+    // explicitly bump them up for some processes.
+    // All rules should still increase resources every attempt to allow the
+    // pipeline to self-heal from MEMLIMIT/RUNLIMIT.
+
+    // Default
+    cpus   = 1
+    memory = { check_max( 50.MB * task.attempt, 'memory' ) }
+    time   = { check_max( 30.min * task.attempt, 'time' ) }
+
+    withName: 'SAMTOOLS_(CONVERT|FILTER)' {
+        time = { check_max( 1.hour * task.attempt, 'time' ) }
+    }
+
+    withName: 'SAMTOOLS_(FASTA)' {
+        time = { check_max( 2.hour * task.attempt, 'time' ) }
+    }
+
+    withName: 'SAMTOOLS_(STATS)' {
+        // Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C
+        time = { check_max( 4.hour * task.attempt, 'time' ) }
     }
-    withLabel:process_low {
-        cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h   * task.attempt, 'time'    ) }
+
+    withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' {
+        time = { check_max( 8.hour * task.attempt, 'time' ) }
     }
-    withLabel:process_medium {
-        cpus   = { check_max( 6     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 36.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 8.h   * task.attempt, 'time'    ) }
+
+    withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS)' {
+        memory = { check_max( 250.MB * task.attempt, 'memory' ) }
    }
-    withLabel:process_high {
-        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 16.h  * task.attempt, 'time'    ) }
+
+    withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' {
+        memory = { check_max( 1.GB * task.attempt, 'memory' ) }
    }
-    withLabel:process_long {
-        time   = { check_max( 20.h  * task.attempt, 'time'    ) }
+
+    withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' {
+        memory = { check_max( 2.GB * task.attempt, 'memory' ) }
    }
-    withLabel:process_high_memory {
-        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+
+    withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
+        memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
    }
-    withLabel:error_ignore {
-        errorStrategy = 'ignore'
+
+    withName: 'SAMTOOLS_SORMADUP' {
+        cpus   = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
+        memory = { check_max( 10.GB + 0.6.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) }
+        time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
    }
-    withLabel:error_retry {
-        errorStrategy = 'retry'
-        maxRetries    = 2
+
+    withName: SAMTOOLS_SORT {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
+        // Memory increases by 768M for each thread
+        memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) }
+        time   = { check_max( 8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) }
    }
-    withName:BWAMEM2_INDEX {
-        memory = { check_max( 1.GB * Math.ceil( 28 * fasta.size() / 1000000000 ) * task.attempt, 'memory' ) }
+
+    withName: BLAST_BLASTN {
+        time   = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
+        memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
+        // The tool never seems to use more than 1 core even when given multiple. Sticking to 1 (the default)
+    }
+
+    withName: BWAMEM2_INDEX {
+        memory = { check_max( 24.GB * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'memory' ) }
+        time   = { check_max( 30.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time' ) }
+        // Not multithreaded
+    }
+
+    withName: BWAMEM2_MEM {
+        // Corresponds to 12 threads as the minimum, 24 threads if 3 billion reads
+        cpus   = { log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2) }
+        // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size
+        // Runtime is considered proportional to the number of reads and inversely proportional to the number of threads
+        time   = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) }
+        // Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM
+        // Memory usage of SAMTOOLS_VIEW is negligible.
+        memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) }
+    }
+
+    withName: MINIMAP2_ALIGN {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) }
+        memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) }
+        time   = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
+    }
+
+    withName: CRUMBLE {
+        // No correlation between memory usage and the number of reads or the genome size.
+        // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.
+        // The formula below tries to mimic that growth and relies on job retries being allowed.
+        memory = { check_max( task.attempt * (task.attempt + 1) * 512.MB, 'memory' ) }
+        // Slightly better correlation between runtime and the number of reads.
+        time   = { check_max( 1.5.h + 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
     }
 
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }
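The scaling comments above are easier to check with concrete numbers. The following standalone Groovy sketch replays the two formulas outside Nextflow; check_max is stubbed out here (in the pipeline it caps each request against the params.max_* limits defined in nextflow.config), and the read counts are invented examples:

// Standalone sketch, not part of the commit. check_max is a stub;
// in the pipeline it caps each value against params.max_cpus etc.
def check_max(value, type) { value }

def positive_log(value, base) {
    value <= 1 ? 0 : Math.log(value) / Math.log(base)
}

def log_increase_cpus(start, step, value, base) {
    check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus')
}

// BWAMEM2_MEM requests log_increase_cpus(6, 6*task.attempt, read_count/1e9, 2) CPUs:
[0.5, 1, 2, 3, 8].each { billionReads ->
    println "${billionReads}e9 reads, attempt 1 -> ${log_increase_cpus(6, 6, billionReads, 2) as int} CPUs"
}
// Prints 12, 12, 18, 24, 30 CPUs, matching the "12 threads as the minimum,
// 24 threads if 3 billion reads" comment in the config above.

// CRUMBLE memory grows with the retry number: attempt * (attempt + 1) * 512 MB.
(1..3).each { attempt ->
    println "CRUMBLE attempt ${attempt} -> ${attempt * (attempt + 1) * 512} MB"
}
// Prints 1024, 3072, 6144 MB: the 1/3/6 GB ladder that approximates the
// observed 1 GB / 2 GB / 5 GB needs while relying on job retries.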
19 changes: 9 additions & 10 deletions conf/modules.config
@@ -33,18 +33,10 @@ process {
     }
 
     withName: SAMTOOLS_COLLATE {
+        ext.args = { (params.use_work_dir_as_temp ? "-T." : "") }
         ext.prefix = { "${meta.id}.collate" }
     }
 
-    withName: SAMTOOLS_FIXMATE {
-        ext.args = '-m'
-        ext.prefix = { "${meta.id}.fixmate" }
-    }
-
-    withName: SAMTOOLS_MARKDUP {
-        ext.prefix = { "${meta.id}.markdup" }
-    }
-
     withName: BLAST_BLASTN {
         ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6'
     }
@@ -58,7 +50,14 @@ process {
     }
 
     withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' {
-        ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" }
+        // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
+        // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes.
+        // NOTE: minimap2 uses the decimal system! 1G = 1,000,000,000 bp
+        // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating-point values.
+        // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes.
+        // NOTE: Use `reference.size()` for now, and switch to `meta2.genome_size` once we update the modules.
+        // ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' }
+        ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' }
     }
 
     withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' {
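The -I arithmetic above can be sanity-checked on its own. In this Groovy sketch the sizes are invented, and the byte count plays the role of reference.size(), i.e. the FASTA file size in bytes that the commit uses as a stand-in for genome length until meta2.genome_size is available:

// Sketch of the -I values the expression above would produce (example sizes only).
def minimapBatch = { long bytes -> "-I" + Math.ceil(bytes / 1e9) + 'G' }

println minimapBatch(400_000_000)    // -I1.0G : well below minimap2 2.24's 4G default, so memory usage drops
println minimapBatch(3_100_000_000)  // -I4.0G : just above the default, avoiding the 4 Gbp limit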
2 changes: 1 addition & 1 deletion docs/output.md
@@ -34,7 +34,7 @@ PacBio reads generated using both CLR and CCS technology are filtered using `BLA
 
 ### Short reads
 
-Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` markduplicate workflow. The mark duplicate alignments is output in the CRAM format, along with the index.
+Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` [mark-duplicate workflow](https://www.htslib.org/algorithms/duplicate.html#workflow). The marked alignments are output in CRAM format, along with the index.
 
 <details markdown="1">
 <summary>Output files</summary>
10 changes: 0 additions & 10 deletions modules.json
@@ -61,11 +61,6 @@
         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
         "installed_by": ["modules"]
     },
-    "samtools/fixmate": {
-        "branch": "master",
-        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-        "installed_by": ["modules"]
-    },
     "samtools/flagstat": {
         "branch": "master",
         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
@@ -76,11 +71,6 @@
         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
         "installed_by": ["modules"]
     },
-    "samtools/markdup": {
-        "branch": "master",
-        "git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8",
-        "installed_by": ["modules"]
-    },
     "samtools/merge": {
         "branch": "master",
         "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013",
2 changes: 1 addition & 1 deletion modules/local/pacbio_filter.nf
@@ -5,7 +5,7 @@ process PACBIO_FILTER {
     conda "conda-forge::gawk=5.1.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
-        'quay.io/biocontainers/gawk:5.1.0' }"
+        'biocontainers/gawk:5.1.0' }"
 
     input:
     tuple val(meta), path(txt)
2 changes: 1 addition & 1 deletion modules/local/samplesheet_check.nf
@@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK {
     conda "conda-forge::python=3.8.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/python:3.8.3' :
-        'quay.io/biocontainers/python:3.8.3' }"
+        'biocontainers/python:3.8.3' }"
 
     input:
    path samplesheet
77 changes: 77 additions & 0 deletions modules/local/samtools_sormadup.nf
@@ -0,0 +1,77 @@
+// Copied from https://github.com/nf-core/modules/pull/3310
+// Author: Matthias De Smet, https://github.com/matthdsm
+process SAMTOOLS_SORMADUP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(input)
+    tuple val(meta2), path(fasta)
+
+    output:
+    tuple val(meta), path("*.{bam,cram}")                , emit: bam
+    tuple val(meta), path("*.{bai,crai}"), optional:true , emit: bam_index
+    tuple val(meta), path("*.metrics")                   , emit: metrics
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args  = task.ext.args  ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def args3 = task.ext.args3 ?: ''
+    def args4 = task.ext.args4 ?: ''
+
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def extension = args.contains("--output-fmt sam") ? "sam" :
+                    args.contains("--output-fmt bam") ? "bam" :
+                    args.contains("--output-fmt cram") ? "cram" :
+                    "bam"
+    def reference = fasta ? "--reference ${fasta}" : ""
+
+    """
+    samtools collate \\
+        $args \\
+        -O \\
+        -u \\
+        -T ${prefix}.collate \\
+        --threads $task.cpus \\
+        ${reference} \\
+        ${input} \\
+        - \\
+    | \\
+    samtools fixmate \\
+        $args2 \\
+        -m \\
+        -u \\
+        --threads $task.cpus \\
+        - \\
+        - \\
+    | \\
+    samtools sort \\
+        $args3 \\
+        -u \\
+        -T ${prefix}.sort \\
+        --threads $task.cpus \\
+        - \\
+    | \\
+    samtools markdup \\
+        -T ${prefix}.markdup \\
+        -f ${prefix}.metrics \\
+        --threads $task.cpus \\
+        $args4 \\
+        - \\
+        ${prefix}.${extension}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
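Each ext.args* slot feeds one stage of this pipe in order (collate, fixmate, sort, markdup), and the module scans ext.args for --output-fmt to pick the output extension. A downstream config can therefore tune each stage independently. A hypothetical override; the values below are illustrative and not part of this commit:

// Hypothetical modules.config override for SAMTOOLS_SORMADUP (example values only).
process {
    withName: 'SAMTOOLS_SORMADUP' {
        ext.prefix = { "${meta.id}.markdup" }
        ext.args   = '--output-fmt cram'  // samtools collate; also makes the module name its output ${prefix}.cram
        ext.args2  = ''                   // samtools fixmate (-m and -u are already hard-coded above)
        ext.args3  = ''                   // samtools sort
        ext.args4  = '-d 2500'            // samtools markdup: optical duplicate distance, e.g. for NovaSeq
    }
}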
2 changes: 1 addition & 1 deletion modules/local/unmask.nf
@@ -5,7 +5,7 @@ process UNMASK {
     conda "conda-forge::gawk=5.1.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
-        'quay.io/biocontainers/gawk:5.1.0' }"
+        'biocontainers/gawk:5.1.0' }"
 
     input:
     tuple val(meta), path(fasta)
9 changes: 8 additions & 1 deletion modules/nf-core/crumble/crumble.diff

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions modules/nf-core/crumble/main.nf

Some generated files are not rendered by default.

