Merge pull request #92 from sanger-tol/resource_optimisation
Resource optimisation
muffato authored Nov 24, 2023
2 parents 273bdf5 + a5d21f4 commit 48f7738
Showing 4 changed files with 97 additions and 62 deletions.
127 changes: 84 additions & 43 deletions conf/base.config
@@ -2,66 +2,107 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sanger-tol/genomenote Nextflow base config file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A 'blank slate' config file, appropriate for general use on most high performance
compute environments. Assumes that all software is installed and available on
the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
----------------------------------------------------------------------------------------
*/

process {
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Increasing the number of CPUs often gives diminishing returns, so we increase it
following a logarithmic curve. Example:
- 0 < value <= 1: start + step
- 1 < value <= 2: start + 2*step
- 2 < value <= 4: start + 3*step
- 4 < value <= 8: start + 4*step
In order to support re-runs, the step increase may be multiplied by the attempt
number prior to calling this function.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

// Modified logarithm function that doesn't return negative numbers
def positive_log(value, base) {
if (value <= 1) {
return 0
} else {
return Math.log(value)/Math.log(base)
}
}

def log_increase_cpus(start, step, value, base) {
return check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus')
}
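
For illustration only (not part of this commit), the scaling rule can be checked with a standalone Groovy sketch, ignoring the check_max() cap:

    // Standalone sketch of the CPU scaling rule above; illustrative only.
    def positive_log_sketch = { value, base -> value <= 1 ? 0 : Math.log(value) / Math.log(base) }
    def log_increase_sketch = { start, step, value, base -> (int) (start + step * (1 + Math.ceil(positive_log_sketch(value, base)))) }

    assert log_increase_sketch(4, 2, 1, 2) == 6   // 0 < value <= 1 -> start + step
    assert log_increase_sketch(4, 2, 2, 2) == 8   // 1 < value <= 2 -> start + 2*step
    assert log_increase_sketch(4, 2, 4, 2) == 10  // 2 < value <= 4 -> start + 3*step
    assert log_increase_sketch(4, 2, 8, 2) == 12  // 4 < value <= 8 -> start + 4*step
    assert log_increase_sketch(4, 4, 1, 2) == 8   // second attempt: callers below pass step = 2*task.attempt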

cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }

process {

errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
maxRetries = 5
maxRetries = 3
maxErrors = '-1'

// Process-specific resource requirements
// NOTE - Please try and re-use the labels below as much as possible.
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
// In this configuration file, we assign minimal resources by default and
// explicitly bump them up for specific processes.
// All rules should still increase resources on every attempt so that the
// pipeline can self-heal from MEMLIMIT/RUNLIMIT failures.

// Default
cpus = 1
memory = { check_max( 50.MB * task.attempt, 'memory' ) }
time = { check_max( 30.min * task.attempt, 'time' ) }

// These processes typically complete within 30 min to 1.5 hours.
withName: 'BED_SORT|BEDTOOLS_BAMTOBED|COOLER_CLOAD|COOLER_ZOOMIFY|FILTER_BED' {
time = { check_max( 4.hour * task.attempt, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }

// These processes may take a few hours.
withName: 'FILTER_SORT|SAMTOOLS_VIEW' {
time = { check_max( 8.hour * task.attempt, 'time' ) }
}

withName: SAMTOOLS_VIEW {
memory = { check_max( 1.GB * task.attempt, 'memory' ) }
}

withName: FASTK_FASTK {
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
}
withLabel:process_medium {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }

withName: MERQURYFK_MERQURYFK {
// Memory usage seems to follow two different linear rules:
// - 1 GB for every 60 Mbp for genomes < 840 Mbp
// - 2 GB for every 1 Gbp for genomes >= 840 Mbp, with a 12 GB offset to match the previous rule
memory = { check_max( 1.GB + ((meta.genome_size < 840000000) ? (Math.ceil(meta.genome_size/60000000) * 1.GB * task.attempt) : (Math.ceil(meta.genome_size/1000000000) * 2.GB * task.attempt + 12.GB)), 'memory' ) }
cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
}
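
As a rough first-attempt illustration (not part of this commit, ignoring the check_max() cap), where meta.genome_size is the uncompressed FASTA file size set in workflows/genomenote.nf:

    // Standalone sketch of the MERQURYFK memory rule above, in GB; illustrative only.
    def merquryfk_mem_gb = { long genomeSize ->
        ( genomeSize < 840000000
            ? 1 + Math.ceil(genomeSize / 60000000)              // 1 GB per 60 Mbp, on top of 1 GB
            : 1 + Math.ceil(genomeSize / 1000000000) * 2 + 12 ) // 2 GB per Gbp, plus the 12 GB offset
    }
    assert merquryfk_mem_gb(600_000_000)   == 11   // ~600 Mbp assembly
    assert merquryfk_mem_gb(2_400_000_000) == 19   // ~2.4 Gbp assembly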
withLabel:process_high {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }

withName: BUSCO {
// Memory usage grows irregularly with genome size. The formula below fits the observed usage to avoid BUSCO being killed.
memory = { check_max(1.GB * Math.max(Math.pow(2, positive_log(meta.genome_size/1000000, 10)+task.attempt), Math.floor(meta.genome_size/1000000000) * 16 * task.attempt), 'memory' ) }
cpus = { log_increase_cpus(4, 2*task.attempt, Math.ceil(meta.genome_size/1000000000), 2) }
time = { check_max( 2.h * Math.ceil(meta.genome_size/1000000000), 'time') }
}
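
For a sense of scale, a standalone sketch of the memory rule (illustrative only, not part of this commit; the CPU and time figures in the comments follow from the cpus and time rules above):

    // Standalone sketch of the BUSCO memory rule above, in GB; illustrative only.
    def positive_log_sketch = { v, b -> v <= 1 ? 0 : Math.log(v) / Math.log(b) }
    def busco_mem_gb = { long genomeSize, int attempt ->
        Math.max(Math.pow(2, positive_log_sketch(genomeSize / 1e6, 10) + attempt),
                 Math.floor(genomeSize / 1e9) * 16 * attempt)
    }
    println busco_mem_gb(100_000_000, 1)    // ~8  GB, with 6 CPUs and 2 h
    println busco_mem_gb(1_000_000_000, 1)  // ~16 GB, with 6 CPUs and 2 h
    println busco_mem_gb(3_000_000_000, 1)  // ~48 GB, with 10 CPUs and 6 h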
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }

withName: 'BED_SORT|FILTER_SORT' {
cpus = { log_increase_cpus(2, 2*task.attempt, 1, 2) }
memory = { check_max( 16.GB * task.attempt, 'memory' ) }
}
withLabel:process_high_memory {
memory = { check_max( 200.GB * task.attempt, 'memory' ) }

withName: COOLER_CLOAD {
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
}
withName:"COOLER_.*" {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }

withName: COOLER_DUMP {
memory = { check_max( 100.MB * task.attempt, 'memory' ) }
}
withLabel:error_ignore {
errorStrategy = 'ignore'

withName: COOLER_ZOOMIFY {
cpus = { log_increase_cpus(2, 2*task.attempt, 1, 2) }
memory = { check_max( (meta.genome_size < 1000000000 ? 16.GB : 24.GB) * task.attempt, 'memory' ) }
}
withLabel:error_retry {
errorStrategy = 'retry'
maxRetries = 2

withName: MULTIQC {
memory = { check_max( 150.MB * task.attempt, 'memory' ) }
}

withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}
1 change: 1 addition & 0 deletions nextflow.config
@@ -207,6 +207,7 @@ report {
trace {
enabled = true
file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt"
fields = 'task_id,hash,native_id,process,tag,status,exit,cpus,memory,time,attempt,submit,start,complete,duration,%cpu,%mem,peak_rss,rchar,wchar'
}
dag {
enabled = true
21 changes: 5 additions & 16 deletions subworkflows/local/genome_statistics.nf
@@ -73,12 +73,14 @@ workflow GENOME_STATISTICS {
| join ( FASTK_FASTK.out.ktab )
| set { ch_combo }

ch_grab = GrabFiles ( ch_pacbio.dir )
ch_pacbio.dir
| map { meta, dir -> [ meta, [dir.listFiles().findAll { it.toString().endsWith(".hist") }], [dir.listFiles().findAll { it.toString().contains(".ktab") }] ] }
| set { ch_grab }

ch_combo
| mix ( ch_grab )
| combine ( genome )
| map { meta, hist, ktab, meta2, fasta -> [ meta, hist, ktab, fasta, [] ] }
| map { meta, hist, ktab, meta2, fasta -> [ meta + [genome_size: meta2.genome_size], hist, ktab, fasta, [] ] }
| set { ch_merq }
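
The genome_size entry merged into the meta map here is what the MERQURYFK_MERQURYFK memory rule in conf/base.config reads as meta.genome_size. A hypothetical element of ch_merq after this step (names and size invented for illustration):

    // [ [id: 'sample1', genome_size: 1300000000], sample1.hist, sample1.ktab, assembly.fasta, [] ]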


@@ -98,9 +100,9 @@

MERQURYFK_MERQURYFK.out.qv
| join ( MERQURYFK_MERQURYFK.out.stats )
| ifEmpty ( [ [], [], [] ] )
| map { meta, qv, comp -> [ meta + [ id: "merq" ], qv, comp ] }
| groupTuple ()
| ifEmpty ( [ [], [], [] ] )
| set { ch_merqury }

CREATETABLE ( ch_summary, ch_busco, ch_merqury, flagstat )
@@ -119,16 +121,3 @@

}


process GrabFiles {
tag "${meta.id}"
executor 'local'

input:
tuple val(meta), path("in")

output:
tuple val(meta), path("in/*.hist"), path("in/*.ktab*", hidden:true)

"true"
}
10 changes: 7 additions & 3 deletions workflows/genomenote.nf
@@ -97,16 +97,20 @@ workflow GENOMENOTE {
// MODULE: Uncompress fasta file if needed
//
ch_fasta
| map { file -> [ [ 'id': file.baseName.tokenize('.')[0..1].join('.') ], file ] }
| map { file -> [ [ 'id': file.baseName ], file ] }
| set { ch_genome }

if ( params.fasta.endsWith('.gz') ) {
ch_fasta = GUNZIP ( ch_genome ).gunzip
ch_unzipped = GUNZIP ( ch_genome ).gunzip
ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() )
} else {
ch_fasta = ch_genome
ch_unzipped = ch_genome
}

ch_unzipped
| map { meta, fa -> [ meta + [id: fa.baseName, genome_size: fa.size()], fa] }
| set { ch_fasta }
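
Note that genome_size here is the byte count of the uncompressed FASTA, which the downstream resource rules in conf/base.config (e.g. BUSCO, MERQURYFK_MERQURYFK) use as a proxy for the assembly length in bases. A hypothetical example of the resulting tuple (name and size invented for illustration):

    // e.g. [ [id: 'assembly1', genome_size: 1300000000], /path/to/assembly1.fasta ]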


//
// SUBWORKFLOW: Create contact map matrices from HiC alignment files
