Merge pull request #82 from sanger-tol/resource_optimisation
Resource optimisation
muffato authored Dec 18, 2023
2 parents bd02f75 + 79bcbbc commit 0f4e2a1
Showing 23 changed files with 320 additions and 350 deletions.
157 changes: 114 additions & 43 deletions conf/base.config
@@ -2,64 +2,135 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     sanger-tol/readmapping Nextflow base config file
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     A 'blank slate' config file, appropriate for general use on most high performance
     compute environments. Assumes that all software is installed and available on
     the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
 ----------------------------------------------------------------------------------------
 */
 
-process {
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Increasing the number of CPUs often gives diminishing returns, so we increase it
+    following a logarithm curve. Example:
+        - 0 < value <= 1: start + step
+        - 1 < value <= 2: start + 2*step
+        - 2 < value <= 4: start + 3*step
+        - 4 < value <= 8: start + 4*step
+    In order to support re-runs, the step increase may be multiplied by the attempt
+    number prior to calling this function.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
 
-    cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
-    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
-    time   = { check_max( 4.h  * task.attempt, 'time'   ) }
+// Modified logarithm function that doesn't return negative numbers
+def positive_log(value, base) {
+    if (value <= 1) {
+        return 0
+    } else {
+        return Math.log(value)/Math.log(base)
+    }
+}
+
+def log_increase_cpus(start, step, value, base) {
+    return check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus')
+}
 
-    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
-    maxRetries    = 1
+process {
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+    maxRetries    = 5
     maxErrors     = '-1'
 
-    // Process-specific resource requirements
-    // NOTE - Please try and re-use the labels below as much as possible.
-    //        These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
-    //        If possible, it would be nice to keep the same label naming convention when
-    //        adding in your local modules too.
-    // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
-    withLabel:process_single {
-        cpus   = { check_max( 1                  , 'cpus'    ) }
-        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h  * task.attempt, 'time'    ) }
+    // In this configuration file, we give little resources by default and
+    // explicitly bump them up for some processes.
+    // All rules should still increase resources every attempt to allow the
+    // pipeline to self-heal from MEMLIMIT/RUNLIMIT.
+
+    // Default
+    cpus   = 1
+    memory = { check_max( 50.MB * task.attempt, 'memory' ) }
+    time   = { check_max( 30.min * task.attempt, 'time' ) }
+
+    withName: 'SAMTOOLS_(CONVERT|FILTER)' {
+        time = { check_max( 1.hour * task.attempt, 'time' ) }
+    }
+
+    withName: 'SAMTOOLS_(FASTA)' {
+        time = { check_max( 2.hour * task.attempt, 'time' ) }
+    }
+
+    withName: 'SAMTOOLS_(STATS)' {
+        // Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C
+        time = { check_max( 4.hour * task.attempt, 'time' ) }
     }
-    withLabel:process_low {
-        cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h   * task.attempt, 'time'    ) }
+
+    withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' {
+        time = { check_max( 8.hour * task.attempt, 'time' ) }
     }
-    withLabel:process_medium {
-        cpus   = { check_max( 6     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 36.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 8.h   * task.attempt, 'time'    ) }
+
+    withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS)' {
+        memory = { check_max( 250.MB * task.attempt, 'memory' ) }
    }
-    withLabel:process_high {
-        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 16.h  * task.attempt, 'time'    ) }
+
+    withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' {
+        memory = { check_max( 1.GB * task.attempt, 'memory' ) }
    }
-    withLabel:process_long {
-        time   = { check_max( 20.h  * task.attempt, 'time'    ) }
+
+    withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' {
+        memory = { check_max( 2.GB * task.attempt, 'memory' ) }
    }
-    withLabel:process_high_memory {
-        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+
+    withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
+        memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
    }
-    withLabel:error_ignore {
-        errorStrategy = 'ignore'
+
+    withName: 'SAMTOOLS_SORMADUP' {
+        cpus   = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
+        memory = { check_max( 10.GB + 0.6.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) }
+        time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
    }
-    withLabel:error_retry {
-        errorStrategy = 'retry'
-        maxRetries    = 2
+
+    withName: SAMTOOLS_SORT {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
+        // Memory increases by 768M for each thread
+        memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) }
+        time   = { check_max( 8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) }
    }
-    withName:BWAMEM2_INDEX {
-        memory = { check_max( 1.GB * Math.ceil( 28 * fasta.size() / 1000000000 ) * task.attempt, 'memory' ) }
+
+    withName: BLAST_BLASTN {
+        time   = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
+        memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
+        // The tool never seems to use more than 1 core even when given multiple. Sticking to 1 (the default)
+    }
+
+    withName: BWAMEM2_INDEX {
+        memory = { check_max( 24.GB * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'memory' ) }
+        time   = { check_max( 30.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time' ) }
+        // Not multithreaded
+    }
+
+    withName: BWAMEM2_MEM {
+        // Corresponds to 12 threads as the minimum, 24 threads if 3 billion reads
+        cpus   = { log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2) }
+        // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size
+        // Runtime is considered proportional to the number of reads and inversely proportional to the number of threads
+        time   = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) }
+        // Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM
+        // Memory usage of SAMTOOLS_VIEW is negligible.
+        memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) }
+    }
+
+    withName: MINIMAP2_ALIGN {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) }
+        memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) }
+        time   = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
+    }
+
+    withName: CRUMBLE {
+        // No correlation between memory usage and the number of reads or the genome size.
+        // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.
+        // The formula below tries to mimic that growth and relies on job retries being allowed.
+        memory = { check_max( task.attempt * (task.attempt + 1) * 512.MB, 'memory' ) }
+        // Slightly better correlation between runtime and the number of reads.
+        time   = { check_max( 1.5.h + 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
     }
 
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }
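The scaling comments above are easier to check with concrete numbers. The following standalone Groovy sketch replays the two formulas outside Nextflow; check_max is stubbed out here (in the pipeline it caps each request against the params.max_* limits defined in nextflow.config), and the read counts are invented examples:

// Standalone sketch, not part of the commit. check_max is a stub;
// in the pipeline it caps each value against params.max_cpus etc.
def check_max(value, type) { value }

def positive_log(value, base) {
    value <= 1 ? 0 : Math.log(value) / Math.log(base)
}

def log_increase_cpus(start, step, value, base) {
    check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus')
}

// BWAMEM2_MEM requests log_increase_cpus(6, 6*task.attempt, read_count/1e9, 2) CPUs:
[0.5, 1, 2, 3, 8].each { billionReads ->
    println "${billionReads}e9 reads, attempt 1 -> ${log_increase_cpus(6, 6, billionReads, 2) as int} CPUs"
}
// Prints 12, 12, 18, 24, 30 CPUs, matching the "12 threads as the minimum,
// 24 threads if 3 billion reads" comment in the config above.

// CRUMBLE memory grows with the retry number: attempt * (attempt + 1) * 512 MB.
(1..3).each { attempt ->
    println "CRUMBLE attempt ${attempt} -> ${attempt * (attempt + 1) * 512} MB"
}
// Prints 1024, 3072, 6144 MB: the 1/3/6 GB ladder that approximates the
// observed 1 GB / 2 GB / 5 GB needs while relying on job retries.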
19 changes: 9 additions & 10 deletions conf/modules.config
@@ -33,18 +33,10 @@ process {
     }
 
     withName: SAMTOOLS_COLLATE {
+        ext.args = { (params.use_work_dir_as_temp ? "-T." : "") }
         ext.prefix = { "${meta.id}.collate" }
     }
 
-    withName: SAMTOOLS_FIXMATE {
-        ext.args = '-m'
-        ext.prefix = { "${meta.id}.fixmate" }
-    }
-
-    withName: SAMTOOLS_MARKDUP {
-        ext.prefix = { "${meta.id}.markdup" }
-    }
-
     withName: BLAST_BLASTN {
         ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6'
     }
@@ -58,7 +50,14 @@ process {
     }
 
     withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' {
-        ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" }
+        // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
+        // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes.
+        // NOTE: minimap2 uses the decimal system! 1G = 1,000,000,000 bp
+        // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating-point values.
+        // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes.
+        // NOTE: Use `reference.size()` for now, and switch to `meta2.genome_size` once we update the modules.
+        // ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' }
+        ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' }
     }
 
     withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' {
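The -I arithmetic above can be sanity-checked on its own. In this Groovy sketch the sizes are invented, and the byte count plays the role of reference.size(), i.e. the FASTA file size in bytes that the commit uses as a stand-in for genome length until meta2.genome_size is available:

// Sketch of the -I values the expression above would produce (example sizes only).
def minimapBatch = { long bytes -> "-I" + Math.ceil(bytes / 1e9) + 'G' }

println minimapBatch(400_000_000)    // -I1.0G : well below minimap2 2.24's 4G default, so memory usage drops
println minimapBatch(3_100_000_000)  // -I4.0G : just above the default, avoiding the 4 Gbp limit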
2 changes: 1 addition & 1 deletion docs/output.md
@@ -34,7 +34,7 @@ PacBio reads generated using both CLR and CCS technology are filtered using `BLA
 
 ### Short reads
 
-Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` markduplicate workflow. The mark duplicate alignments is output in the CRAM format, along with the index.
+Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` [mark-duplicate workflow](https://www.htslib.org/algorithms/duplicate.html#workflow). The marked alignments are output in CRAM format, along with the index.
 
 <details markdown="1">
 <summary>Output files</summary>
10 changes: 0 additions & 10 deletions modules.json
@@ -61,11 +61,6 @@
         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
         "installed_by": ["modules"]
     },
-    "samtools/fixmate": {
-        "branch": "master",
-        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-        "installed_by": ["modules"]
-    },
     "samtools/flagstat": {
         "branch": "master",
         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
@@ -76,11 +71,6 @@
         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
         "installed_by": ["modules"]
     },
-    "samtools/markdup": {
-        "branch": "master",
-        "git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8",
-        "installed_by": ["modules"]
-    },
     "samtools/merge": {
         "branch": "master",
         "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013",
2 changes: 1 addition & 1 deletion modules/local/pacbio_filter.nf
@@ -5,7 +5,7 @@ process PACBIO_FILTER {
     conda "conda-forge::gawk=5.1.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
-        'quay.io/biocontainers/gawk:5.1.0' }"
+        'biocontainers/gawk:5.1.0' }"
 
     input:
     tuple val(meta), path(txt)
2 changes: 1 addition & 1 deletion modules/local/samplesheet_check.nf
@@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK {
     conda "conda-forge::python=3.8.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/python:3.8.3' :
-        'quay.io/biocontainers/python:3.8.3' }"
+        'biocontainers/python:3.8.3' }"
 
     input:
    path samplesheet
77 changes: 77 additions & 0 deletions modules/local/samtools_sormadup.nf
@@ -0,0 +1,77 @@
+// Copied from https://github.com/nf-core/modules/pull/3310
+// Author: Matthias De Smet, https://github.com/matthdsm
+process SAMTOOLS_SORMADUP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(input)
+    tuple val(meta2), path(fasta)
+
+    output:
+    tuple val(meta), path("*.{bam,cram}")                , emit: bam
+    tuple val(meta), path("*.{bai,crai}"), optional:true , emit: bam_index
+    tuple val(meta), path("*.metrics")                   , emit: metrics
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args  = task.ext.args  ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def args3 = task.ext.args3 ?: ''
+    def args4 = task.ext.args4 ?: ''
+
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def extension = args.contains("--output-fmt sam") ? "sam" :
+                    args.contains("--output-fmt bam") ? "bam" :
+                    args.contains("--output-fmt cram") ? "cram" :
+                    "bam"
+    def reference = fasta ? "--reference ${fasta}" : ""
+
+    """
+    samtools collate \\
+        $args \\
+        -O \\
+        -u \\
+        -T ${prefix}.collate \\
+        --threads $task.cpus \\
+        ${reference} \\
+        ${input} \\
+        - \\
+    | \\
+    samtools fixmate \\
+        $args2 \\
+        -m \\
+        -u \\
+        --threads $task.cpus \\
+        - \\
+        - \\
+    | \\
+    samtools sort \\
+        $args3 \\
+        -u \\
+        -T ${prefix}.sort \\
+        --threads $task.cpus \\
+        - \\
+    | \\
+    samtools markdup \\
+        -T ${prefix}.markdup \\
+        -f ${prefix}.metrics \\
+        --threads $task.cpus \\
+        $args4 \\
+        - \\
+        ${prefix}.${extension}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
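Each ext.args* slot feeds one stage of this pipe in order (collate, fixmate, sort, markdup), and the module scans ext.args for --output-fmt to pick the output extension. A downstream config can therefore tune each stage independently. A hypothetical override; the values below are illustrative and not part of this commit:

// Hypothetical modules.config override for SAMTOOLS_SORMADUP (example values only).
process {
    withName: 'SAMTOOLS_SORMADUP' {
        ext.prefix = { "${meta.id}.markdup" }
        ext.args   = '--output-fmt cram'  // samtools collate; also makes the module name its output ${prefix}.cram
        ext.args2  = ''                   // samtools fixmate (-m and -u are already hard-coded above)
        ext.args3  = ''                   // samtools sort
        ext.args4  = '-d 2500'            // samtools markdup: optical duplicate distance, e.g. for NovaSeq
    }
}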
2 changes: 1 addition & 1 deletion modules/local/unmask.nf
@@ -5,7 +5,7 @@ process UNMASK {
     conda "conda-forge::gawk=5.1.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
-        'quay.io/biocontainers/gawk:5.1.0' }"
+        'biocontainers/gawk:5.1.0' }"
 
     input:
     tuple val(meta), path(fasta)
9 changes: 8 additions & 1 deletion modules/nf-core/crumble/crumble.diff

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions modules/nf-core/crumble/main.nf

Some generated files are not rendered by default.

