From 2109bed08430acce75f008631be2fef7c1aa5202 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 31 Oct 2023 12:56:47 +0000 Subject: [PATCH 01/50] Collect the read counts from the input files --- subworkflows/local/input_check.nf | 40 ++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 379d2fa..681a935 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -3,6 +3,7 @@ // include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' workflow INPUT_CHECK { @@ -11,20 +12,35 @@ workflow INPUT_CHECK { main: + ch_versions = Channel.empty() + + // Read the samplesheet SAMPLESHEET_CHECK ( samplesheet ).csv | splitCsv ( header:true, sep:',' ) - | map { create_data_channel( it ) } + // Prepare the channel for SAMTOOLS_FLAGSTAT + | map { row -> [row + [id: file(row.datafile).baseName], file(row.datafile, checkIfExists: true), []] } + | set { samplesheet_rows } + ch_versions = ch_versions.mix ( SAMPLESHEET_CHECK.out.versions.first() ) + + // Get stats from each input file + SAMTOOLS_FLAGSTAT ( samplesheet_rows ) + ch_versions = ch_versions.mix ( SAMTOOLS_FLAGSTAT.out.versions.first() ) + + // Create the read channel for the rest of the pipeline + samplesheet_rows + | join( SAMTOOLS_FLAGSTAT.out.flagstat ) + | map { meta, datafile, dummy, stats -> create_data_channel( meta, datafile, stats ) } | set { reads } emit: reads // channel: [ val(meta), /path/to/datafile ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } // Function to get list of [ meta, reads ] -def create_data_channel ( LinkedHashMap row ) { +def create_data_channel ( LinkedHashMap row, datafile, stats ) { // create meta map def meta = [:] meta.id = row.sample @@ -39,13 +55,15 @@ def 
create_data_channel ( LinkedHashMap row ) { } meta.read_group = "\'@RG\\tID:" + row.datafile.split('/')[-1].split('\\.')[0] + "\\tPL:" + platform + "\\tSM:" + meta.id.split('_')[0..-2].join('_') + "\'" - - // add path(s) of the read file(s) to the meta map - def data_meta = [] - if ( !file(row.datafile).exists() ) { - exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.datafile}" - } else { - data_meta = [ meta, file(row.datafile) ] + // Read the first line of the flagstat file + // 3127898040 + 0 in total (QC-passed reads + QC-failed reads) + // and make the sum of both integers + stats.withReader { + line = it.readLine() + def lspl = line.split() + def read_count = lspl[0].toInteger() + lspl[2].toInteger() + meta.read_count = read_count } - return data_meta + + return [meta, datafile] } From e2e8ead830fb5d5161fc0dc9cb9f463c8165c4e0 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 31 Oct 2023 12:57:15 +0000 Subject: [PATCH 02/50] Collect the size of the genome (file size is a good proxy) --- subworkflows/local/prepare_genome.nf | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index b5f23fd..5264569 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -19,12 +19,15 @@ workflow PREPARE_GENOME { // Uncompress genome fasta file if required if ( params.fasta.endsWith('.gz') ) { - ch_fasta = GUNZIP ( fasta ).gunzip + ch_unzipped = GUNZIP ( fasta ).gunzip ch_versions = ch_versions.mix ( GUNZIP.out.versions ) } else { - ch_fasta = fasta + ch_unzipped = fasta } + ch_unzipped + | map { meta, fa -> [ meta + [id: fa.baseName, genome_size: fa.size()], fa] } + | set { ch_fasta } // Unmask genome fasta UNMASK ( ch_fasta ) From 11678fc5db5d56b57c77e10e0442302042e3a0fc Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 31 Oct 2023 12:58:46 +0000 Subject: [PATCH 03/50] Simply use the whole 
base name to name the channel --- workflows/readmapping.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index 75995b4..9d12b0b 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -81,7 +81,7 @@ workflow READMAPPING { // SUBWORKFLOW: Uncompress and prepare reference genome files // ch_fasta - | map { [ [ id: it.baseName.tokenize(".")[0..1].join(".") ], it ] } + | map { [ [ id: it.baseName ], it ] } | set { ch_genome } PREPARE_GENOME ( ch_genome ) From 84caa1072dabf9f97043f7dbdaab1683673be9f8 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 31 Oct 2023 13:01:00 +0000 Subject: [PATCH 04/50] Deal with genomes > 4 Gbp Closes #81 --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 08c127c..88450e5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -58,7 +58,7 @@ process { } withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { - ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" } + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" + (meta.genome_size > 4294967296 ? 
(" -I" + Math.ceil(meta.genome_size/1073741824)+"G") : "") } } withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { From 8dca0f3e93a4b64dc943729d8ea065ad9a4a0130 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 2 Nov 2023 14:30:09 +0000 Subject: [PATCH 05/50] Update the read count for SAMTOOLS_MERGE --- subworkflows/local/align_ont.nf | 3 ++- subworkflows/local/align_pacbio.nf | 3 ++- subworkflows/local/markdup_stats.nf | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index 3d28217..e37e01d 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -29,8 +29,9 @@ workflow ALIGN_ONT { // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple ( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } | set { ch_bams } diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 080af18..0365619 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -36,8 +36,9 @@ workflow ALIGN_PACBIO { // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple ( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } | set { ch_bams } diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf index 4fca771..a749ec2 100644 --- a/subworkflows/local/markdup_stats.nf +++ b/subworkflows/local/markdup_stats.nf @@ -25,8 
+25,9 @@ workflow MARKDUP_STATS { // Collect all BWAMEM2 output by sample name SAMTOOLS_SORT.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } | set { ch_bams } From 8c67dafe53af6084853f6ba27a9f541b4293a653 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 2 Nov 2023 15:22:24 +0000 Subject: [PATCH 06/50] Some values are large and need 64 bits --- subworkflows/local/input_check.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 681a935..26fe88d 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -61,7 +61,7 @@ def create_data_channel ( LinkedHashMap row, datafile, stats ) { stats.withReader { line = it.readLine() def lspl = line.split() - def read_count = lspl[0].toInteger() + lspl[2].toInteger() + def read_count = lspl[0].toLong() + lspl[2].toLong() meta.read_count = read_count } From e432d894961fd4c737da4e05093776e216e221c6 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 2 Nov 2023 16:19:53 +0000 Subject: [PATCH 07/50] Wrong information --- conf/base.config | 4 ---- 1 file changed, 4 deletions(-) diff --git a/conf/base.config b/conf/base.config index 284ac0b..f782727 100644 --- a/conf/base.config +++ b/conf/base.config @@ -2,10 +2,6 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sanger-tol/readmapping Nextflow base config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - A 'blank slate' config file, appropriate for general use on most high performance - compute environments. 
Assumes that all software is installed and available on - the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. ----------------------------------------------------------------------------------------- */ process { From ae6942b9cf51ccf9102cf294bcc4cce982d22e82 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 2 Nov 2023 16:20:38 +0000 Subject: [PATCH 08/50] Updated the error codes to the latest template versions. Covers all LSF exit codes --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index f782727..c27163e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,7 @@ process { memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' From 46e991068773f6c355bfe7a26ead661ce1abdf26 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 2 Nov 2023 16:21:18 +0000 Subject: [PATCH 09/50] Estimate the resource requirements based on the size of the inputs --- conf/base.config | 91 +++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/conf/base.config b/conf/base.config index c27163e..f73ece2 100644 --- a/conf/base.config +++ b/conf/base.config @@ -6,56 +6,69 @@ process { - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 + maxRetries = 2 maxErrors = '-1' - // Process-specific resource requirements - // NOTE - Please try and re-use the labels below as much as possible. 
- // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. - // If possible, it would be nice to keep the same label naming convention when - // adding in your local modules too. - // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors - withLabel:process_single { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } - } - withLabel:process_low { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } - } - withLabel:process_medium { + // In this configuration file, we give little resources by default and + // explicitly bump them up for some processes. + // All rules should still increase resources every attempt to allow the + // pipeline to self-heal from MEMLIMIT/RUNLIMIT. + + // Default + cpus = 1 + memory = { check_max( 50.MB * task.attempt, 'memory' ) } + time = { check_max( 30.min * task.attempt, 'time' ) } + + // These SAMTOOLS processes can take more than 30 min, and sometimes several hours. + // Let's give them 8 hours, which should be plenty of time. + withName: 'SAMTOOLS_(COLLATE|CONVERT|FASTA|FASTQ|FIXMATE|MARKDUP|MERGE|SORT|STATS|VIEW)' { + time = { check_max( 8.hour * task.attempt, 'time' ) } + } + + // A bit more memory for these SAMTOOLS. 
+ withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS|MARKDUP|STATS|VIEW)' { + memory = { check_max( 1.GB * task.attempt, 'memory' ) } + } + + withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' { cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } } - withLabel:process_high { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 72.GB * task.attempt, 'memory' ) } - time = { check_max( 16.h * task.attempt, 'time' ) } + withName: '.*:ALIGN_HIC:.*:SAMTOOLS_COLLATE' { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 1.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } } - withLabel:process_long { - time = { check_max( 20.h * task.attempt, 'time' ) } + + withName: SAMTOOLS_SORT { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } } - withLabel:process_high_memory { - memory = { check_max( 200.GB * task.attempt, 'memory' ) } + + withName: BLAST_BLASTN { + time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } + // The tool never seems to use more than 1 core even when given multiple. 
Sticking to 1 (the default) } - withLabel:error_ignore { - errorStrategy = 'ignore' + + withName: BWAMEM2_INDEX { + memory = { check_max( 24.GB * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'memory' ) } + time = { check_max( 20.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time' ) } + // Not multithreaded } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 + + withName: BWAMEM2_MEM { + memory = { check_max( 8.GB + 8.GB * Math.ceil( meta2.genome_size / 1000000000 ) * task.attempt, 'memory' ) } + time = { check_max( 1.min * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + cpus = { check_max( 8 + 4 * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'cpus' ) } } - withName:BWAMEM2_INDEX { - memory = { check_max( 1.GB * Math.ceil( 28 * fasta.size() / 1000000000 ) * task.attempt, 'memory' ) } + + withName: MINIMAP2_ALIGN { + memory = { check_max( 8.GB + 6.GB * Math.ceil( reference.size() / 1000000000 ) * task.attempt, 'memory' ) } + time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + cpus = { check_max( 4 + 2 * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'cpus' ) } } + withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } From 5f2d78636f9ff3906c696f900a1e0d4c6825a88a Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 07:19:16 +0000 Subject: [PATCH 10/50] bugfix: minimap2 uses the decimal system and understands floating point values ! --- conf/modules.config | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 88450e5..3c4b10d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -58,7 +58,11 @@ process { } withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { - ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" + (meta.genome_size > 4294967296 ? 
(" -I" + Math.ceil(meta.genome_size/1073741824)+"G") : "") } + // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp. + // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp + // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values. + // NOTE: minimap2 2.25 raises the default to 8G + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" + (meta.genome_size > 4e9 ? (" -I" + Math.ceil(meta.genome_size/1e9)+"G") : "") } } withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { From 35a646d427f25e70bd0db1d4e2a8ca729df1a475 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 07:54:13 +0000 Subject: [PATCH 11/50] The output of SAMTOOLS_MERGE is sorted --- subworkflows/local/align_ont.nf | 7 +------ subworkflows/local/align_pacbio.nf | 8 +------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index e37e01d..f773696 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -40,13 +40,8 @@ workflow ALIGN_ONT { ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_MERGE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORT.out.bam + SAMTOOLS_MERGE.out.bam | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 0365619..fcf8a88 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -5,7 +5,6 @@ include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { 
SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' @@ -47,13 +46,8 @@ workflow ALIGN_PACBIO { ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_MERGE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORT.out.bam + SAMTOOLS_MERGE.out.bam | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } From 04b2660680ab729b2a9cb2cc7a89f037ba811ee7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 08:23:27 +0000 Subject: [PATCH 12/50] Logically, SAMTOOLS_MERGE should happen in the calling sub-workflow --- subworkflows/local/markdup_stats.nf | 14 ++++++++++---- subworkflows/local/markduplicate.nf | 10 ++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf index a749ec2..ef90f2e 100644 --- a/subworkflows/local/markdup_stats.nf +++ b/subworkflows/local/markdup_stats.nf @@ -3,9 +3,10 @@ // Convert to CRAM and calculate statistics // -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' -include { MARKDUPLICATE } from '../../subworkflows/local/markduplicate' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' +include { MARKDUPLICATE } from '../../subworkflows/local/markduplicate' +include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' workflow MARKDUP_STATS { @@ -31,8 +32,13 @@ workflow MARKDUP_STATS { | set { ch_bams } + // Merge position sorted bam files + SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + // Mark 
duplicates - MARKDUPLICATE ( ch_bams ) + MARKDUPLICATE ( SAMTOOLS_MERGE.out.bam ) ch_versions = ch_versions.mix ( MARKDUPLICATE.out.versions ) diff --git a/subworkflows/local/markduplicate.nf b/subworkflows/local/markduplicate.nf index 3f47aa4..7568826 100644 --- a/subworkflows/local/markduplicate.nf +++ b/subworkflows/local/markduplicate.nf @@ -2,7 +2,6 @@ // Merge BAM files and mark duplicates // -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' include { SAMTOOLS_COLLATE } from '../../modules/nf-core/samtools/collate/main' include { SAMTOOLS_FIXMATE } from '../../modules/nf-core/samtools/fixmate/main' include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' @@ -11,20 +10,15 @@ include { SAMTOOLS_MARKDUP } from '../../modules/nf-core/samtools/markdup/main' workflow MARKDUPLICATE { take: - bams // channel: [ val(meta), [ /path/to/bams ] ] + bam // channel: val(meta), [ /path/to/bam ] main: ch_versions = Channel.empty() - // Merge position sorted bam files - SAMTOOLS_MERGE ( bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - // Collate merged BAM file - SAMTOOLS_COLLATE ( SAMTOOLS_MERGE.out.bam, [] ) + SAMTOOLS_COLLATE ( bam, [] ) ch_versions = ch_versions.mix ( SAMTOOLS_COLLATE.out.versions.first() ) From f9b27630619fd9b0f2f40ff06bd4a492f5a8c2c7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 08:31:56 +0000 Subject: [PATCH 13/50] Skip SAMTOOLS_MERGE if there is a single file --- subworkflows/local/align_ont.nf | 8 +++++++- subworkflows/local/align_pacbio.nf | 8 +++++++- subworkflows/local/markdup_stats.nf | 14 ++++++++++++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index f773696..07cd319 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -32,16 +32,22 @@ workflow ALIGN_ONT { | map { meta, bam -> [['id': 
meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple ( by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } | set { ch_bams } // Merge - SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) // Convert merged BAM to CRAM and calculate indices and statistics SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index fcf8a88..fac8ac1 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -38,16 +38,22 @@ workflow ALIGN_PACBIO { | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple ( by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } | set { ch_bams } // Merge - SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) // Convert merged BAM to CRAM and calculate indices and statistics SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf index ef90f2e..6ce10c5 100644 --- a/subworkflows/local/markdup_stats.nf +++ b/subworkflows/local/markdup_stats.nf @@ -29,16 +29,26 @@ workflow MARKDUP_STATS { | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple( 
by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } | set { ch_bams } // Merge position sorted bam files - SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) + | set { ch_bam } + + // Mark duplicates - MARKDUPLICATE ( SAMTOOLS_MERGE.out.bam ) + MARKDUPLICATE ( ch_bam ) ch_versions = ch_versions.mix ( MARKDUPLICATE.out.versions ) From fb1cd6d175aca0191ee70b78ebeff9133b9d0d99 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 08:36:10 +0000 Subject: [PATCH 14/50] Explain why there is no SAMTOOLS_SORT --- subworkflows/local/align_ont.nf | 1 + subworkflows/local/align_pacbio.nf | 1 + 2 files changed, 2 insertions(+) diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index 07cd319..bace9e8 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -23,6 +23,7 @@ workflow ALIGN_ONT { | map { meta, file -> file } | set { ch_fasta } + // Align with minimap2. bam_format is set to true, making the output a *sorted* BAM MINIMAP2_ALIGN ( reads, ch_fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index fac8ac1..d3e517a 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -29,6 +29,7 @@ workflow ALIGN_PACBIO { | map { meta, file -> file } | set { ch_fasta } + // Align with minimap2. 
bam_format is set to true, making the output a *sorted* BAM MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, ch_fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) From c00eee62ffc680e26f1f0700d6111bce78d38b1f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 09:04:49 +0000 Subject: [PATCH 15/50] Replaced the markduplicate workflow by a single module / bash pipeline --- conf/base.config | 9 +-- conf/modules.config | 9 --- docs/output.md | 2 +- modules.json | 10 --- modules/local/samtools_sormadup.nf | 82 +++++++++++++++++++++++ modules/nf-core/samtools/fixmate/main.nf | 37 ---------- modules/nf-core/samtools/fixmate/meta.yml | 49 -------------- modules/nf-core/samtools/markdup/main.nf | 63 ----------------- modules/nf-core/samtools/markdup/meta.yml | 44 ------------ subworkflows/local/markdup_stats.nf | 14 ++-- subworkflows/local/markduplicate.nf | 43 ------------ 11 files changed, 95 insertions(+), 267 deletions(-) create mode 100644 modules/local/samtools_sormadup.nf delete mode 100644 modules/nf-core/samtools/fixmate/main.nf delete mode 100644 modules/nf-core/samtools/fixmate/meta.yml delete mode 100644 modules/nf-core/samtools/markdup/main.nf delete mode 100644 modules/nf-core/samtools/markdup/meta.yml delete mode 100644 subworkflows/local/markduplicate.nf diff --git a/conf/base.config b/conf/base.config index f73ece2..b07c447 100644 --- a/conf/base.config +++ b/conf/base.config @@ -22,12 +22,12 @@ process { // These SAMTOOLS processes can take more than 30 min, and sometimes several hours. // Let's give them 8 hours, which should be plenty of time. - withName: 'SAMTOOLS_(COLLATE|CONVERT|FASTA|FASTQ|FIXMATE|MARKDUP|MERGE|SORT|STATS|VIEW)' { + withName: 'SAMTOOLS_(COLLATE|CONVERT|FASTA|FASTQ|MERGE|SORMADUP|SORT|STATS|VIEW)' { time = { check_max( 8.hour * task.attempt, 'time' ) } } // A bit more memory for these SAMTOOLS. 
- withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS|MARKDUP|STATS|VIEW)' { + withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS|STATS|VIEW)' { memory = { check_max( 1.GB * task.attempt, 'memory' ) } } @@ -35,9 +35,10 @@ process { cpus = { check_max( 6 * task.attempt, 'cpus' ) } memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } } - withName: '.*:ALIGN_HIC:.*:SAMTOOLS_COLLATE' { + + withName: 'SAMTOOLS_SORMADUP' { cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 1.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } + memory = { check_max( 1.GB + 1.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } } withName: SAMTOOLS_SORT { diff --git a/conf/modules.config b/conf/modules.config index 3c4b10d..c27c024 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -36,15 +36,6 @@ process { ext.prefix = { "${meta.id}.collate" } } - withName: SAMTOOLS_FIXMATE { - ext.args = '-m' - ext.prefix = { "${meta.id}.fixmate" } - } - - withName: SAMTOOLS_MARKDUP { - ext.prefix = { "${meta.id}.markdup" } - } - withName: BLAST_BLASTN { ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' } diff --git a/docs/output.md b/docs/output.md index 8cc61c3..9722d11 100644 --- a/docs/output.md +++ b/docs/output.md @@ -34,7 +34,7 @@ PacBio reads generated using both CLR and CCS technology are filtered using `BLA ### Short reads -Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` markduplicate workflow. The mark duplicate alignments is output in the CRAM format, along with the index. +Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. 
The sorted and merged alignment files are processed using the `SAMTOOLS` [mark-duplicate workflow](https://www.htslib.org/algorithms/duplicate.html#workflow). The duplicate-marked alignments are output in the CRAM format, along with the index.
Output files diff --git a/modules.json b/modules.json index ac47073..a9a06be 100644 --- a/modules.json +++ b/modules.json @@ -61,11 +61,6 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "samtools/fixmate": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] - }, "samtools/flagstat": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -76,11 +71,6 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "samtools/markdup": { - "branch": "master", - "git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8", - "installed_by": ["modules"] - }, "samtools/merge": { "branch": "master", "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", diff --git a/modules/local/samtools_sormadup.nf b/modules/local/samtools_sormadup.nf new file mode 100644 index 0000000..c81737f --- /dev/null +++ b/modules/local/samtools_sormadup.nf @@ -0,0 +1,82 @@ +process SAMTOOLS_SORMADUP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.{bam,cram}") , emit: bam + tuple val(meta), path("*.{bai,crai}") , optional:true, emit: bam_index + tuple val(meta), path("*.metrics") , emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def args4 = task.ext.args4 ?: '' + def args5 = task.ext.args5 ?: '' + + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-fmt sam") ? 
"sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + "bam" + def reference = fasta ? "--reference ${fasta}" : "" + def sort_memory = (task.memory.mega/task.cpus*0.75).intValue() + + """ + samtools cat \\ + $args2 \\ + --threads $task.cpus \\ + ${input} \\ + | \\ + samtools collate \\ + $args3 \\ + -O \\ + -u \\ + -T ${prefix}.collate \\ + --threads $task.cpus \\ + ${reference} \\ + - \\ + | \\ + samtools fixmate \\ + $args4 \\ + -m \\ + -u \\ + --threads $task.cpus \\ + - \\ + - \\ + | \\ + samtools sort \\ + $args5 \\ + -u \\ + -T ${prefix}.sort \\ + --threads $task.cpus \\ + -m ${sort_memory}M \\ + - \\ + | \\ + samtools markdup \\ + -T ${prefix}.markdup \\ + -f ${prefix}.metrics \\ + --threads $task.cpus \\ + $args \\ + - \\ + ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fixmate/main.nf b/modules/nf-core/samtools/fixmate/main.nf deleted file mode 100644 index 7127bff..0000000 --- a/modules/nf-core/samtools/fixmate/main.nf +++ /dev/null @@ -1,37 +0,0 @@ -process SAMTOOLS_FIXMATE { - tag "$meta.id" - label 'process_low' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(bam) - - output: - tuple val(meta), path("*.bam"), emit: bam - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
- """ - samtools \\ - fixmate \\ - $args \\ - --threads ${task.cpus-1} \\ - $bam \\ - ${prefix}.bam \\ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/fixmate/meta.yml b/modules/nf-core/samtools/fixmate/meta.yml deleted file mode 100644 index a72c5ca..0000000 --- a/modules/nf-core/samtools/fixmate/meta.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: samtools_fixmate -description: Samtools fixmate is a tool that can fill in information (insert size, cigar, mapq) about paired end reads onto the corresponding other read. Also has options to remove secondary/unmapped alignments and recalculate whether reads are proper pairs. -keywords: - - fixmate - - samtools - - insert size - - repair - - bam - - paired - - read pairs -tools: - - samtools: - description: | - SAMtools is a set of utilities for interacting with and post-processing - short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. - These files are generated as output by short read aligners like BWA. - homepage: http://www.htslib.org/ - documentation: http://www.htslib.org/doc/samtools.html - tool_dev_url: https://github.com/samtools/samtools - doi: 10.1093/bioinformatics/btp352 - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file, must be sorted by name, not coordinate - pattern: "*.{bam,cram,sam}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - bam: - type: file - description: A BAM/CRAM/SAM file with mate information added and/or proper pairs recalled - pattern: "*.{bam,cram,sam}" - -authors: - - "@sppearce" diff --git a/modules/nf-core/samtools/markdup/main.nf b/modules/nf-core/samtools/markdup/main.nf deleted file mode 100644 index 218cf97..0000000 --- a/modules/nf-core/samtools/markdup/main.nf +++ /dev/null @@ -1,63 +0,0 @@ -process SAMTOOLS_MARKDUP { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(input) - path fasta - - output: - tuple val(meta), path("*.bam"), emit: bam, optional: true - tuple val(meta), path("*.cram"), emit: cram, optional: true - tuple val(meta), path("*.sam"), emit: sam, optional: true - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def reference = fasta ? "--reference ${fasta}" : "" - def extension = args.contains("--output-fmt sam") ? "sam" : - args.contains("--output-fmt bam") ? "bam" : - args.contains("--output-fmt cram") ? "cram" : - "bam" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
- """ - samtools \\ - markdup \\ - $args \\ - ${reference} \\ - -@ $task.cpus \\ - -T $prefix \\ - $input \\ - ${prefix}.${extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def extension = args.contains("--output-fmt sam") ? "sam" : - args.contains("--output-fmt bam") ? "bam" : - args.contains("--output-fmt cram") ? "cram" : - "bam" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - touch ${prefix}.${extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/markdup/meta.yml b/modules/nf-core/samtools/markdup/meta.yml deleted file mode 100644 index 4207c93..0000000 --- a/modules/nf-core/samtools/markdup/meta.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: "samtools_markdup" -description: mark duplicate alignments in a coordinate sorted file -keywords: - - bam - - duplicates - - markduplicates - - samtools -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-markdup.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: "['MIT']" - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - output: - type: file - description: Sorted BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -authors: - - "@priyanka-surana" diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf index 6ce10c5..cd4de47 100644 --- a/subworkflows/local/markdup_stats.nf +++ b/subworkflows/local/markdup_stats.nf @@ -3,10 +3,10 @@ // Convert to CRAM and calculate statistics // -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' -include { MARKDUPLICATE } from '../../subworkflows/local/markduplicate' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' +include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' workflow MARKDUP_STATS { @@ -48,12 +48,12 @@ workflow MARKDUP_STATS { // Mark duplicates - MARKDUPLICATE ( ch_bam ) - ch_versions = ch_versions.mix ( MARKDUPLICATE.out.versions ) + SAMTOOLS_SORMADUP ( ch_bam, fasta ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) // Convert merged BAM to CRAM and calculate indices and statistics - MARKDUPLICATE.out.bam + SAMTOOLS_SORMADUP.out.bam | map { meta, bam -> [ meta, bam, [] ] } | set { ch_stat } diff --git a/subworkflows/local/markduplicate.nf b/subworkflows/local/markduplicate.nf deleted file mode 100644 index 7568826..0000000 --- a/subworkflows/local/markduplicate.nf +++ /dev/null @@ -1,43 +0,0 @@ -// -// Merge BAM files and mark duplicates -// - -include { SAMTOOLS_COLLATE } from '../../modules/nf-core/samtools/collate/main' -include { SAMTOOLS_FIXMATE } from 
'../../modules/nf-core/samtools/fixmate/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_MARKDUP } from '../../modules/nf-core/samtools/markdup/main' - - -workflow MARKDUPLICATE { - take: - bam // channel: val(meta), [ /path/to/bam ] - - - main: - ch_versions = Channel.empty() - - - // Collate merged BAM file - SAMTOOLS_COLLATE ( bam, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_COLLATE.out.versions.first() ) - - - // Fill in mate coordinates and insert size fields - SAMTOOLS_FIXMATE ( SAMTOOLS_COLLATE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_FIXMATE.out.versions.first() ) - - - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_FIXMATE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - - // Mark duplicates - SAMTOOLS_MARKDUP ( SAMTOOLS_SORT.out.bam, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MARKDUP.out.versions.first() ) - - - emit: - bam = SAMTOOLS_MARKDUP.out.bam // channel: [ val(meta), /path/to/bam ] - versions = ch_versions // channel: [ versions.yml ] -} From 5c39de583f07cdff84802974a34387d7fab72247 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 09:59:56 +0000 Subject: [PATCH 16/50] The runtime of samtools sort depends on the number of reads --- conf/base.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index b07c447..776452f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -22,7 +22,7 @@ process { // These SAMTOOLS processes can take more than 30 min, and sometimes several hours. // Let's give them 8 hours, which should be plenty of time. 
- withName: 'SAMTOOLS_(COLLATE|CONVERT|FASTA|FASTQ|MERGE|SORMADUP|SORT|STATS|VIEW)' { + withName: 'SAMTOOLS_(COLLATE|CONVERT|FASTA|FASTQ|MERGE|STATS|VIEW)' { time = { check_max( 8.hour * task.attempt, 'time' ) } } @@ -39,11 +39,13 @@ process { withName: 'SAMTOOLS_SORMADUP' { cpus = { check_max( 6 * task.attempt, 'cpus' ) } memory = { check_max( 1.GB + 1.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } + time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } withName: SAMTOOLS_SORT { cpus = { check_max( 6 * task.attempt, 'cpus' ) } memory = { check_max( 4.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } + time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } withName: BLAST_BLASTN { From ff9ff03e05bf624df9f580bef727d44e8930e650 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 10:00:36 +0000 Subject: [PATCH 17/50] Updated requirements for SAMTOOLS_SORMADUP --- conf/base.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index 776452f..86cf0a2 100644 --- a/conf/base.config +++ b/conf/base.config @@ -37,9 +37,9 @@ process { } withName: 'SAMTOOLS_SORMADUP' { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 1.GB + 1.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } - time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } + time = { check_max( 6.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } withName: SAMTOOLS_SORT { From d54d46fca4313fbe9923c057eb816e47a51d5ed8 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 10:01:46 +0000 Subject: [PATCH 
18/50] The MINIMAP2_ALIGN includes SAMTOOLS_SORT. Need some extra memory related to the number of reads --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 86cf0a2..fde9d49 100644 --- a/conf/base.config +++ b/conf/base.config @@ -67,7 +67,7 @@ process { } withName: MINIMAP2_ALIGN { - memory = { check_max( 8.GB + 6.GB * Math.ceil( reference.size() / 1000000000 ) * task.attempt, 'memory' ) } + memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) } time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } cpus = { check_max( 4 + 2 * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'cpus' ) } } From 582bedd90a982c39ca6aeef704c6b9b6781a1859 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 10:02:14 +0000 Subject: [PATCH 19/50] In my latest tests, it seems BWAMEM2_MEM memory usage is correlated with the logarithm of the genome size --- conf/base.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index fde9d49..56ca0c3 100644 --- a/conf/base.config +++ b/conf/base.config @@ -61,9 +61,9 @@ process { } withName: BWAMEM2_MEM { - memory = { check_max( 8.GB + 8.GB * Math.ceil( meta2.genome_size / 1000000000 ) * task.attempt, 'memory' ) } - time = { check_max( 1.min * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } - cpus = { check_max( 8 + 4 * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB + 1.GB * Math.ceil(24 * Math.log(Math.ceil( meta2.genome_size / 1000000000 ))) * task.attempt, 'memory' ) } + time = { check_max( 1.min * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + cpus = { check_max( 8 + 4 * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'cpus' ) } } withName: MINIMAP2_ALIGN { 
From d884d470ccea1ee5f3ff0ad985d447de3816d60c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 10:02:32 +0000 Subject: [PATCH 20/50] I don't need samtools cat --- modules/local/samtools_sormadup.nf | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/modules/local/samtools_sormadup.nf b/modules/local/samtools_sormadup.nf index c81737f..d6a1397 100644 --- a/modules/local/samtools_sormadup.nf +++ b/modules/local/samtools_sormadup.nf @@ -25,7 +25,6 @@ process SAMTOOLS_SORMADUP { def args2 = task.ext.args2 ?: '' def args3 = task.ext.args3 ?: '' def args4 = task.ext.args4 ?: '' - def args5 = task.ext.args5 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def extension = args.contains("--output-fmt sam") ? "sam" : @@ -36,22 +35,18 @@ process SAMTOOLS_SORMADUP { def sort_memory = (task.memory.mega/task.cpus*0.75).intValue() """ - samtools cat \\ - $args2 \\ - --threads $task.cpus \\ - ${input} \\ - | \\ samtools collate \\ - $args3 \\ + $args \\ -O \\ -u \\ -T ${prefix}.collate \\ --threads $task.cpus \\ ${reference} \\ + ${input} \\ - \\ | \\ samtools fixmate \\ - $args4 \\ + $args2 \\ -m \\ -u \\ --threads $task.cpus \\ @@ -59,7 +54,7 @@ process SAMTOOLS_SORMADUP { - \\ | \\ samtools sort \\ - $args5 \\ + $args3 \\ -u \\ -T ${prefix}.sort \\ --threads $task.cpus \\ @@ -70,7 +65,7 @@ process SAMTOOLS_SORMADUP { -T ${prefix}.markdup \\ -f ${prefix}.metrics \\ --threads $task.cpus \\ - $args \\ + $args4 \\ - \\ ${prefix}.${extension} From a581ae1a50535b71e00d1f7cc9396b3821951c4e Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 6 Nov 2023 10:05:16 +0000 Subject: [PATCH 21/50] Alignment is nice --- conf/base.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index 56ca0c3..490ac6d 100644 --- a/conf/base.config +++ b/conf/base.config @@ -43,9 +43,9 @@ process { } withName: SAMTOOLS_SORT { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { 
check_max( 4.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } - time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } + time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } withName: BLAST_BLASTN { From d1414931f3917ddacd6e62b4dc5bd91ab499a998 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 8 Nov 2023 12:13:42 +0000 Subject: [PATCH 22/50] Also use -I to decrease the memory requirement of MINIMAP2_ALIGN --- conf/modules.config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c27c024..763a4a0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,10 +50,11 @@ process { withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp. + // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values. - // NOTE: minimap2 2.25 raises the default to 8G - ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" + (meta.genome_size > 4e9 ? 
(" -I" + Math.ceil(meta.genome_size/1e9)+"G") : "") } + // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' } } withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { From 613c4cbd9f68d102022fa16e4b59276da31283df Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 8 Nov 2023 12:14:16 +0000 Subject: [PATCH 23/50] 20 minutes seems a bit too close to the real usage. We may want a larger buffer --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 490ac6d..de8a7bb 100644 --- a/conf/base.config +++ b/conf/base.config @@ -56,7 +56,7 @@ process { withName: BWAMEM2_INDEX { memory = { check_max( 24.GB * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'memory' ) } - time = { check_max( 20.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time' ) } + time = { check_max( 30.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time' ) } // Not multithreaded } From bc844bcb8f95bc5d062dc3d28fa55c03d2de05c9 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 8 Nov 2023 12:16:30 +0000 Subject: [PATCH 24/50] Don't increase the number of CPUs too high as there are diminishing returns --- conf/base.config | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/conf/base.config b/conf/base.config index de8a7bb..01b4f89 100644 --- a/conf/base.config +++ b/conf/base.config @@ -4,6 +4,24 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Increasing the number of CPUs often gives diminishing returns, so we increase it + following a logarithm curve. 
Example: + - 0 < value <= 1: base + step + - 1 < value <= 2: base + 2*step + - 2 < value <= 4: base + 3*step + - 4 < value <= 8: base + 4*step + In order to support re-runs, the step increase may be multiplied by the attempt + number prior to calling this function. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +def log_increase_cpus(base, step, value) { + return check_max(base + step * Math.ceil(Math.log(Math.ceil(value))/Math.log(2)), 'cpus') +} + + process { errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } @@ -32,18 +50,18 @@ process { } withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } + cpus = { log_increase_cpus(4, 2*task.attempt, 1) } memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } } withName: 'SAMTOOLS_SORMADUP' { - cpus = { check_max( 8 * task.attempt, 'cpus' ) } + cpus = { log_increase_cpus(8, 4*task.attempt, 1) } memory = { check_max( 6.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } time = { check_max( 6.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } withName: SAMTOOLS_SORT { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } + cpus = { log_increase_cpus(4, 2*task.attempt, 1) } memory = { check_max( 4.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } @@ -67,9 +85,9 @@ process { } withName: MINIMAP2_ALIGN { + cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000) } memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) } time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } - cpus = { check_max( 4 + 2 * Math.ceil( meta.read_count / 1000000 
) * task.attempt, 'cpus' ) } } withName:CUSTOM_DUMPSOFTWAREVERSIONS { From ec695e600bdb7384da2b5b2e3b5e27f61e74849f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 8 Nov 2023 12:17:44 +0000 Subject: [PATCH 25/50] Some samtools commands have a fixed memory usage per thread, so include the CPU count in the memory requirement --- conf/base.config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/conf/base.config b/conf/base.config index 01b4f89..c17a4ed 100644 --- a/conf/base.config +++ b/conf/base.config @@ -56,13 +56,14 @@ process { withName: 'SAMTOOLS_SORMADUP' { cpus = { log_increase_cpus(8, 4*task.attempt, 1) } - memory = { check_max( 6.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } + memory = { check_max( 8.GB + 1600.MB * log_increase_cpus(8, 4*task.attempt, 1), 'memory' ) } time = { check_max( 6.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } withName: SAMTOOLS_SORT { cpus = { log_increase_cpus(4, 2*task.attempt, 1) } - memory = { check_max( 4.GB + 2.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } + // Memory increases by 768M for each thread + memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1), 'memory' ) } time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } From d8043a5dd2a98203f0c584dbaf953bada0b7498a Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 9 Nov 2023 16:50:27 +0000 Subject: [PATCH 26/50] There was supposed to be a +1 there, to provide the "* task.attempt" mechanism --- conf/base.config | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/conf/base.config b/conf/base.config index c17a4ed..db83d76 100644 --- a/conf/base.config +++ b/conf/base.config @@ -8,17 +8,17 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Increasing the number of CPUs often gives diminishing returns, so 
we increase it following a logarithm curve. Example: - - 0 < value <= 1: base + step - - 1 < value <= 2: base + 2*step - - 2 < value <= 4: base + 3*step - - 4 < value <= 8: base + 4*step + - 0 < value <= 1: start + step + - 1 < value <= 2: start + 2*step + - 2 < value <= 4: start + 3*step + - 4 < value <= 8: start + 4*step In order to support re-runs, the step increase may be multiplied by the attempt number prior to calling this function. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ def log_increase_cpus(base, step, value) { - return check_max(base + step * Math.ceil(Math.log(Math.ceil(value))/Math.log(2)), 'cpus') + return check_max(base + step * (1 + Math.ceil(Math.log(Math.ceil(value))/Math.log(2))), 'cpus') } From 647a748695530163d01eae76bde177c236f17434 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 9 Nov 2023 17:03:12 +0000 Subject: [PATCH 27/50] Introduced a helper method that clearly shows how the logarithm is modified --- conf/base.config | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/conf/base.config b/conf/base.config index db83d76..05927f7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -17,8 +17,17 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def log_increase_cpus(base, step, value) { - return check_max(base + step * (1 + Math.ceil(Math.log(Math.ceil(value))/Math.log(2))), 'cpus') +// Modified logarithm function that doesn't return negative numbers +def positive_log(value, base) { + if (value <= 1) { + return 0 + } else { + return Math.log(value)/Math.log(base) + } +} + +def log_increase_cpus(start, step, value, base) { + return check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus') } @@ -50,20 +59,20 @@ process { } withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' { - cpus = { log_increase_cpus(4, 2*task.attempt, 1) } + cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) 
} memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } } withName: 'SAMTOOLS_SORMADUP' { - cpus = { log_increase_cpus(8, 4*task.attempt, 1) } - memory = { check_max( 8.GB + 1600.MB * log_increase_cpus(8, 4*task.attempt, 1), 'memory' ) } + cpus = { log_increase_cpus(8, 4*task.attempt, 1, 2) } + memory = { check_max( 8.GB + 1600.MB * log_increase_cpus(8, 4*task.attempt, 1, 2), 'memory' ) } time = { check_max( 6.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } withName: SAMTOOLS_SORT { - cpus = { log_increase_cpus(4, 2*task.attempt, 1) } + cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } // Memory increases by 768M for each thread - memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1), 'memory' ) } + memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) } time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } @@ -86,7 +95,7 @@ process { } withName: MINIMAP2_ALIGN { - cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000) } + cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) } time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } } From d0b305ab413372517cd6eef2c396ad839e3cfa65 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 9 Nov 2023 17:08:48 +0000 Subject: [PATCH 28/50] Increasing the number of attempts to give more resilience --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 05927f7..4c7120f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -34,7 +34,7 @@ def log_increase_cpus(start, step, value, base) { process { errorStrategy = { task.exitStatus in ((130..145) + 
104) ? 'retry' : 'finish' } - maxRetries = 2 + maxRetries = 3 maxErrors = '-1' // In this configuration file, we give little resources by default and From 2465d7438a74d7de0f6cf93066270065f9c01d46 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 9 Nov 2023 17:10:08 +0000 Subject: [PATCH 29/50] New formula that is less greedy --- conf/base.config | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index 4c7120f..b8f8d04 100644 --- a/conf/base.config +++ b/conf/base.config @@ -89,9 +89,15 @@ process { } withName: BWAMEM2_MEM { - memory = { check_max( 12.GB + 1.GB * Math.ceil(24 * Math.log(Math.ceil( meta2.genome_size / 1000000000 ))) * task.attempt, 'memory' ) } - time = { check_max( 1.min * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } - cpus = { check_max( 8 + 4 * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'cpus' ) } + // Corresponds to 12 threads as the mininum, 24 threads if 3 billion reads + cpus = { log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2) } + // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size + // Runtime is considered proportional to the number of reads and inversely to number of threads + time = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) } + // Memory usage for 1 BWAMEM2 thread is 500 MB for every 1 Gbp. + // Memory usage of BWAMEM2 is about proportional to the number of threads but need to add about 6 GB to avoid MEMLIMIT. + // Memory usage of SAMTOOLS_VIEW is negligible. 
+ memory = { check_max( 6.GB + 500.MB * Math.ceil(meta2.genome_size / 1000000000) * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } } withName: MINIMAP2_ALIGN { From a9ec952c2062c518617959b8af26fda2cc5a5e44 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 9 Nov 2023 17:40:58 +0000 Subject: [PATCH 30/50] Adjusted resource requirements for SAMTOOLS_SORMADUP --- conf/base.config | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index b8f8d04..b879983 100644 --- a/conf/base.config +++ b/conf/base.config @@ -64,9 +64,10 @@ process { } withName: 'SAMTOOLS_SORMADUP' { - cpus = { log_increase_cpus(8, 4*task.attempt, 1, 2) } - memory = { check_max( 8.GB + 1600.MB * log_increase_cpus(8, 4*task.attempt, 1, 2), 'memory' ) } - time = { check_max( 6.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } + // The main factor is the number of CPUs. + cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } + memory = { check_max( 12.GB + 1.5.GB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) } + time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(4, 2*task.attempt, 1, 2), 'time' ) } } withName: SAMTOOLS_SORT { From a249a763beceb6d54d1b814a0295e1cac8298028 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 10 Nov 2023 09:29:09 +0000 Subject: [PATCH 31/50] Added a note about minimap2 --- conf/modules.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 763a4a0..c247b55 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -54,7 +54,9 @@ process { // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values. 
// NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes - ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + // NOTE: Use `reference.size()` for now, and switch to `meta2.genome_size` once we update the modules. + // ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } } withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { From 99d289d26aa0b5dc9ce4ecc4cc5dc0cb8dcb3b5e Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 10 Nov 2023 09:43:20 +0000 Subject: [PATCH 32/50] Added some credit --- modules/local/samtools_sormadup.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/local/samtools_sormadup.nf b/modules/local/samtools_sormadup.nf index d6a1397..dbdeeaa 100644 --- a/modules/local/samtools_sormadup.nf +++ b/modules/local/samtools_sormadup.nf @@ -1,3 +1,5 @@ +// Copied from https://github.com/nf-core/modules/pull/3310 +// Author: Matthias De Smet, https://github.com/matthdsm process SAMTOOLS_SORMADUP { tag "$meta.id" label 'process_medium' From 9afa76312b6267ae81edd816ebcbf1723972f00a Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 10 Nov 2023 09:45:44 +0000 Subject: [PATCH 33/50] More consistent comment --- subworkflows/local/align_ont.nf | 2 +- subworkflows/local/align_pacbio.nf | 2 +- subworkflows/local/markdup_stats.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index bace9e8..c1d2263 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -41,7 +41,7 @@ workflow ALIGN_ONT { | set { ch_bams } - // Merge + // Merge, but only if there is more than 1 file SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( 
SAMTOOLS_MERGE.out.versions.first() ) diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index d3e517a..01cd1ac 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -47,7 +47,7 @@ workflow ALIGN_PACBIO { | set { ch_bams } - // Merge + // Merge, but only if there is more than 1 file SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf index cd4de47..9b271f8 100644 --- a/subworkflows/local/markdup_stats.nf +++ b/subworkflows/local/markdup_stats.nf @@ -37,7 +37,7 @@ workflow MARKDUP_STATS { | set { ch_bams } - // Merge position sorted bam files + // Merge, but only if there is more than 1 file SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) From eae32611f3c8b931719f509cebb24a6f00ded330 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 10 Nov 2023 09:48:21 +0000 Subject: [PATCH 34/50] Tell it's a meta map --- subworkflows/local/input_check.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 26fe88d..d8a8a53 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -29,7 +29,7 @@ workflow INPUT_CHECK { // Create the read channel for the rest of the pipeline samplesheet_rows | join( SAMTOOLS_FLAGSTAT.out.flagstat ) - | map { meta, datafile, dummy, stats -> create_data_channel( meta, datafile, stats ) } + | map { meta, datafile, meta2, stats -> create_data_channel( meta, datafile, stats ) } | set { reads } From f211fb4c10546dd7b411f44d4b89e04e83a27be5 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 10 Nov 2023 10:25:39 +0000 Subject: [PATCH 35/50] Indentation should be a multiple of 4 --- conf/base.config | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/base.config b/conf/base.config index b879983..728a8ea 100644 --- a/conf/base.config +++ b/conf/base.config @@ -8,10 +8,10 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Increasing the number of CPUs often gives diminishing returns, so we increase it following a logarithm curve. Example: - - 0 < value <= 1: start + step - - 1 < value <= 2: start + 2*step - - 2 < value <= 4: start + 3*step - - 4 < value <= 8: start + 4*step + - 0 < value <= 1: start + step + - 1 < value <= 2: start + 2*step + - 2 < value <= 4: start + 3*step + - 4 < value <= 8: start + 4*step In order to support re-runs, the step increase may be multiplied by the attempt number prior to calling this function. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 5b70604ef2775dbc882b2c2ba2f5b1fcd130e4e5 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 10 Nov 2023 17:58:29 +0000 Subject: [PATCH 36/50] SAMTOOLS_FLAGSTAT may take more time --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 728a8ea..aca833a 100644 --- a/conf/base.config +++ b/conf/base.config @@ -49,7 +49,7 @@ process { // These SAMTOOLS processes can take more than 30 min, and sometimes several hours. // Let's give them 8 hours, which should be plenty of time. 
- withName: 'SAMTOOLS_(COLLATE|CONVERT|FASTA|FASTQ|MERGE|STATS|VIEW)' { + withName: 'SAMTOOLS_(COLLATE|CONVERT|FASTA|FASTQ|FLAGSTAT|MERGE|STATS|VIEW)' { time = { check_max( 8.hour * task.attempt, 'time' ) } } From 0c5e4f9f14fbd8c0aeb7a0d8efb85082f475f6c2 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 16 Nov 2023 17:45:55 +0000 Subject: [PATCH 37/50] typo --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index aca833a..18134c1 100644 --- a/conf/base.config +++ b/conf/base.config @@ -90,7 +90,7 @@ process { } withName: BWAMEM2_MEM { - // Corresponds to 12 threads as the mininum, 24 threads if 3 billion reads + // Corresponds to 12 threads as the minimum, 24 threads if 3 billion reads cpus = { log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2) } // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size // Runtime is considered proportional to the number of reads and inversely to number of threads From c27dd301bcdc21d0191a67d62c290eef1082c5f4 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 16 Nov 2023 18:30:11 +0000 Subject: [PATCH 38/50] Increased runtime, just in case --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 18134c1..6ffff82 100644 --- a/conf/base.config +++ b/conf/base.config @@ -74,7 +74,7 @@ process { cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } // Memory increases by 768M for each thread memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) } - time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } + time = { check_max( 8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } withName: BLAST_BLASTN { From a2924b05697bf347a1f42761f24c62f67ddc6231 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 21 Nov 2023 
08:30:28 +0000 Subject: [PATCH 39/50] Updated runtime and memory requirements --- conf/base.config | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/conf/base.config b/conf/base.config index 6ffff82..445a07c 100644 --- a/conf/base.config +++ b/conf/base.config @@ -47,14 +47,28 @@ process { memory = { check_max( 50.MB * task.attempt, 'memory' ) } time = { check_max( 30.min * task.attempt, 'time' ) } - // These SAMTOOLS processes can take more than 30 min, and sometimes several hours. - // Let's give them 8 hours, which should be plenty of time. - withName: 'SAMTOOLS_(COLLATE|CONVERT|FASTA|FASTQ|FLAGSTAT|MERGE|STATS|VIEW)' { + withName: 'SAMTOOLS_(CONVERT|FILTER)' { + time = { check_max( 1.hour * task.attempt, 'time' ) } + } + + withName: 'SAMTOOLS_(FASTA)' { + time = { check_max( 2.hour * task.attempt, 'time' ) } + } + + withName: 'SAMTOOLS_(STATS)' { + // Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C + time = { check_max( 4.hour * task.attempt, 'time' ) } + } + + withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' { time = { check_max( 8.hour * task.attempt, 'time' ) } } - // A bit more memory for these SAMTOOLS. - withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS|STATS|VIEW)' { + withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS)' { + memory = { check_max( 250.MB * task.attempt, 'memory' ) } + } + + withName: 'SAMTOOLS_(STATS|VIEW)' { memory = { check_max( 1.GB * task.attempt, 'memory' ) } } @@ -64,6 +78,12 @@ process { } withName: 'SAMTOOLS_SORMADUP' { + // COLLATE + FIXMATE + SORT + MARKDUP + // COLLATE + //cpus = { check_max( 6 * task.attempt, 'cpus' ) } + //memory = { check_max( 1.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } + // MARKDUP + //memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1500000000 ) * task.attempt, 'memory' ) } // The main factor is the number of CPUs. 
cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } memory = { check_max( 12.GB + 1.5.GB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) } From b2f22a27db033f458cfaa4fafeecf85598829d0d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 27 Nov 2023 09:16:23 +0000 Subject: [PATCH 40/50] Need these fields in the debug output --- nextflow.config | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow.config b/nextflow.config index 8e22d19..6b34823 100644 --- a/nextflow.config +++ b/nextflow.config @@ -166,6 +166,7 @@ report { trace { enabled = true file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + fields = 'task_id,hash,native_id,process,tag,status,exit,cpus,memory,time,attempt,submit,start,complete,duration,%cpu,%mem,peak_rss,rchar,wchar' } dag { enabled = true From cfac7c807e14bb8efafddd43845dad1a01f84708 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 27 Nov 2023 11:45:17 +0000 Subject: [PATCH 41/50] Like in the genome note pipeline, use the work directory instead of /tmp Had to bump this samtools to 1.17 as the option is not available in earlier versions. --- conf/modules.config | 1 + nextflow.config | 3 +++ nextflow_schema.json | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index c247b55..31f4c6b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -33,6 +33,7 @@ process { } withName: SAMTOOLS_COLLATE { + ext.args = { (params.use_work_dir_as_temp ? "-T." 
: "") } ext.prefix = { "${meta.id}.collate" } } diff --git a/nextflow.config b/nextflow.config index 6b34823..9df2682 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,9 @@ params { bwamem2_index = null fasta = null + // Execution options + use_work_dir_as_temp = false + // Boilerplate options outdir = "./results" diff --git a/nextflow_schema.json b/nextflow_schema.json index 5f44c81..737b3c2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -67,6 +67,21 @@ } } }, + "execution": { + "title": "Execution", + "type": "object", + "description": "Control the execution of the pipeline.", + "default": "", + "properties": { + "use_work_dir_as_temp": { + "type": "boolean", + "description": "Set to true to make tools (e.g. sort, FastK, MerquryFK) use the work directory for their temporary files, rather than the system default.", + "fa_icon": "fas fa-arrow-circle-down", + "hidden": true + } + }, + "fa_icon": "fas fa-running" + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -236,6 +251,9 @@ { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/execution" + }, { "$ref": "#/definitions/institutional_config_options" }, From 9da9f3f9e9b340f5aa5bab01dc42e64663fb3825 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 29 Nov 2023 09:55:46 +0000 Subject: [PATCH 42/50] Updated the settings for SORMADUP --- conf/base.config | 13 +++---------- modules/local/samtools_sormadup.nf | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/conf/base.config b/conf/base.config index 445a07c..d528ed0 100644 --- a/conf/base.config +++ b/conf/base.config @@ -78,16 +78,9 @@ process { } withName: 'SAMTOOLS_SORMADUP' { - // COLLATE + FIXMATE + SORT + MARKDUP - // COLLATE - //cpus = { check_max( 6 * task.attempt, 'cpus' ) } - //memory = { check_max( 1.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } - // MARKDUP - //memory = { check_max( 1.GB * 
Math.ceil( meta.read_count / 1500000000 ) * task.attempt, 'memory' ) } - // The main factor is the number of CPUs. - cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } - memory = { check_max( 12.GB + 1.5.GB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) } - time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(4, 2*task.attempt, 1, 2), 'time' ) } + cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) } + memory = { check_max( 10.GB + 6.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } + time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } } withName: SAMTOOLS_SORT { diff --git a/modules/local/samtools_sormadup.nf b/modules/local/samtools_sormadup.nf index dbdeeaa..8c1c530 100644 --- a/modules/local/samtools_sormadup.nf +++ b/modules/local/samtools_sormadup.nf @@ -34,7 +34,7 @@ process SAMTOOLS_SORMADUP { args.contains("--output-fmt cram") ? "cram" : "bam" def reference = fasta ? 
"--reference ${fasta}" : "" - def sort_memory = (task.memory.mega/task.cpus*0.75).intValue() + def sort_memory = ((task.memory.mega - 1024 * (1 + Math.ceil( meta.read_count / 100000000 )) * task.attempt)/task.cpus).intValue() """ samtools collate \\ From 5e155894f3a2011230adb0c76d25ed5ef79fdc63 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 29 Nov 2023 10:00:08 +0000 Subject: [PATCH 43/50] Updated the BWA_MEM memory requirement --- conf/base.config | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index d528ed0..c560e05 100644 --- a/conf/base.config +++ b/conf/base.config @@ -108,10 +108,9 @@ process { // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size // Runtime is considered proportional to the number of reads and inversely to number of threads time = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) } - // Memory usage for 1 BWAMEM2 thread is 500 MB for every 1 Gbp. - // Memory usage of BWAMEM2 is about proportional to the number of threads but need to add about 6 GB to avoid MEMLIMIT. + // Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM // Memory usage of SAMTOOLS_VIEW is negligible. 
- memory = { check_max( 6.GB + 500.MB * Math.ceil(meta2.genome_size / 1000000000) * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } + memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } } withName: MINIMAP2_ALIGN { From bdbcbb4b3d039091df8ca58331411c393762ce5b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 29 Nov 2023 10:01:51 +0000 Subject: [PATCH 44/50] Increased the number of retries --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index c560e05..78e0e04 100644 --- a/conf/base.config +++ b/conf/base.config @@ -34,7 +34,7 @@ def log_increase_cpus(start, step, value, base) { process { errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 3 + maxRetries = 5 maxErrors = '-1' // In this configuration file, we give little resources by default and From 3905219c7e89e81c82dbeb29c7f759ef0d552ef7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 5 Dec 2023 10:19:13 +0000 Subject: [PATCH 45/50] The default memory settings work just fine and make things easier to manipulate --- modules/local/samtools_sormadup.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/local/samtools_sormadup.nf b/modules/local/samtools_sormadup.nf index 8c1c530..c5a31ef 100644 --- a/modules/local/samtools_sormadup.nf +++ b/modules/local/samtools_sormadup.nf @@ -34,7 +34,6 @@ process SAMTOOLS_SORMADUP { args.contains("--output-fmt cram") ? "cram" : "bam" def reference = fasta ? 
"--reference ${fasta}" : "" - def sort_memory = ((task.memory.mega - 1024 * (1 + Math.ceil( meta.read_count / 100000000 )) * task.attempt)/task.cpus).intValue() """ samtools collate \\ @@ -60,7 +59,6 @@ process SAMTOOLS_SORMADUP { -u \\ -T ${prefix}.sort \\ --threads $task.cpus \\ - -m ${sort_memory}M \\ - \\ | \\ samtools markdup \\ From c15d92385f1734d5b5a8db4313097c6a6873251b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 5 Dec 2023 10:21:52 +0000 Subject: [PATCH 46/50] Usage is very close to the trend line. Smaller bins work fine --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 78e0e04..378947a 100644 --- a/conf/base.config +++ b/conf/base.config @@ -79,7 +79,7 @@ process { withName: 'SAMTOOLS_SORMADUP' { cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) } - memory = { check_max( 10.GB + 6.GB * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'memory' ) } + memory = { check_max( 10.GB + 0.6.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } } From 40947244b85b09e1e84b86e64cb3133ae9b96cf1 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 7 Dec 2023 15:22:55 +0000 Subject: [PATCH 47/50] quay.io/ is now the default --- modules/local/pacbio_filter.nf | 2 +- modules/local/samplesheet_check.nf | 2 +- modules/local/samtools_sormadup.nf | 2 +- modules/local/unmask.nf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/pacbio_filter.nf b/modules/local/pacbio_filter.nf index 18dd11c..e4deaa4 100644 --- a/modules/local/pacbio_filter.nf +++ b/modules/local/pacbio_filter.nf @@ -5,7 +5,7 @@ process PACBIO_FILTER { conda "conda-forge::gawk=5.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'quay.io/biocontainers/gawk:5.1.0' }" + 'biocontainers/gawk:5.1.0' }" input: tuple val(meta), path(txt) diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 9c44c61..f0a3073 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK { conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'biocontainers/python:3.8.3' }" input: path samplesheet diff --git a/modules/local/samtools_sormadup.nf b/modules/local/samtools_sormadup.nf index c5a31ef..5aadab5 100644 --- a/modules/local/samtools_sormadup.nf +++ b/modules/local/samtools_sormadup.nf @@ -7,7 +7,7 @@ process SAMTOOLS_SORMADUP { conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(input) diff --git a/modules/local/unmask.nf b/modules/local/unmask.nf index 482eefc..72d1a07 100644 --- a/modules/local/unmask.nf +++ b/modules/local/unmask.nf @@ -5,7 +5,7 @@ process UNMASK { conda "conda-forge::gawk=5.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'quay.io/biocontainers/gawk:5.1.0' }" + 'biocontainers/gawk:5.1.0' }" input: tuple val(meta), path(fasta) From 3ef0477fb53d157077b71576946545983b01ff41 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 8 Dec 2023 13:06:56 +0000 Subject: [PATCH 48/50] Added optimised settings for crumble --- conf/base.config | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/conf/base.config b/conf/base.config index 378947a..ca983bb 100644 --- a/conf/base.config +++ b/conf/base.config @@ -119,6 +119,15 @@ process { time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } } + withName: CRUMBLE { + // No correlation between memory usage and the number of reads or the genome size. + // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB. + // The formula below tries to mimic that growth and relies on job retries being allowed. + memory = { check_max( task.attempt * (task.attempt + 1) * 512.MB, 'memory' ) } + // Slightly better correlation between runtime and the number of reads. 
+ time = { check_max( 1.5.h + 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + } + withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } From f0c1ba4f823e66a92d2b0ab1b5428134409b0e48 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 8 Dec 2023 14:12:06 +0000 Subject: [PATCH 49/50] Need to fake REF_PATH to force crumble to use the Fasta file defined in the UR field of the @SQ headers --- modules/nf-core/crumble/crumble.diff | 9 ++++++++- modules/nf-core/crumble/main.nf | 3 +++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/modules/nf-core/crumble/crumble.diff b/modules/nf-core/crumble/crumble.diff index 00c5857..2c4cb1e 100644 --- a/modules/nf-core/crumble/crumble.diff +++ b/modules/nf-core/crumble/crumble.diff @@ -1,7 +1,7 @@ Changes in module 'nf-core/crumble' --- modules/nf-core/crumble/main.nf +++ modules/nf-core/crumble/main.nf -@@ -30,7 +30,7 @@ +@@ -30,11 +30,14 @@ args.contains("-O cram") ? "cram" : "sam" def bedin = keepbed ? "-R ${keepbed}" : "" @@ -10,5 +10,12 @@ Changes in module 'nf-core/crumble' if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ ++ # Need to fake REF_PATH to force crumble to use the Fasta file defined in ++ # the UR field of the @SQ headers. (bug reported to the samtools team). ++ env REF_PATH=/missing \\ + crumble \\ + $args \\ + $bedin \\ ************************************************************ diff --git a/modules/nf-core/crumble/main.nf b/modules/nf-core/crumble/main.nf index a250829..44c0c59 100644 --- a/modules/nf-core/crumble/main.nf +++ b/modules/nf-core/crumble/main.nf @@ -35,6 +35,9 @@ process CRUMBLE { def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
""" + # Need to fake REF_PATH to force crumble to use the Fasta file defined in + # the UR field of the @SQ headers. (bug reported to the samtools team). + env REF_PATH=/missing \\ crumble \\ $args \\ $bedin \\ From 79bcbbc2a96d21bd053590c6f14586ce20887601 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sat, 9 Dec 2023 09:15:45 +0000 Subject: [PATCH 50/50] There is a difference for ONT, which I assume would be there for CLR too --- conf/base.config | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index ca983bb..cdffac4 100644 --- a/conf/base.config +++ b/conf/base.config @@ -68,9 +68,12 @@ process { memory = { check_max( 250.MB * task.attempt, 'memory' ) } } - withName: 'SAMTOOLS_(STATS|VIEW)' { + withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' { memory = { check_max( 1.GB * task.attempt, 'memory' ) } } + withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' { + memory = { check_max( 2.GB * task.attempt, 'memory' ) } + } withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' { cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) }