diff --git a/conf/base.config b/conf/base.config
index 284ac0b..cdffac4 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -2,64 +2,135 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     sanger-tol/readmapping Nextflow base config file
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    A 'blank slate' config file, appropriate for general use on most high performance
-    compute environments. Assumes that all software is installed and available on
-    the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
----------------------------------------------------------------------------------------
 */

-process {
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Increasing the number of CPUs often gives diminishing returns, so we increase it
+    following a logarithmic curve. Example:
+      - 0 < value <= 1: start + step
+      - 1 < value <= 2: start + 2*step
+      - 2 < value <= 4: start + 3*step
+      - 4 < value <= 8: start + 4*step
+    In order to support re-runs, the step increase may be multiplied by the attempt
+    number prior to calling this function.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/

-    cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
-    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
-    time   = { check_max( 4.h  * task.attempt, 'time'   ) }
+// Modified logarithm function that doesn't return negative numbers
+def positive_log(value, base) {
+    if (value <= 1) {
+        return 0
+    } else {
+        return Math.log(value)/Math.log(base)
+    }
+}
+
+def log_increase_cpus(start, step, value, base) {
+    return check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus')
+}

-    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
-    maxRetries    = 1
+
+process {
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+    maxRetries    = 5
     maxErrors     = '-1'

-    // Process-specific resource requirements
-    // NOTE - Please try and re-use the labels below as much as possible.
-    //        These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
-    //        If possible, it would be nice to keep the same label naming convention when
-    //        adding in your local modules too.
-    // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
-    withLabel:process_single {
-        cpus   = { check_max( 1                  , 'cpus'    ) }
-        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h  * task.attempt, 'time'    ) }
+    // In this configuration file, we request minimal resources by default and
+    // explicitly bump them up for some processes.
+    // All rules should still increase resources with every attempt to allow the
+    // pipeline to self-heal from MEMLIMIT/RUNLIMIT.
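To sanity-check the scaling rule described in the comment block above, here is a standalone Groovy sketch of `positive_log` / `log_increase_cpus`. It is illustrative only: `check_max` is stubbed with an assumed cap of 16 CPUs (the real `check_max` lives in the pipeline's nextflow.config), and the start/step values are made up.

```groovy
// Stub of check_max with an assumed cap of 16 CPUs; the real implementation
// caps against params.max_cpus in nextflow.config.
def check_max(value, type) {
    return Math.min(value as int, 16)
}

// Log that never goes negative: any value <= 1 means "no increase".
def positive_log(value, base) {
    if (value <= 1) {
        return 0
    } else {
        return Math.log(value) / Math.log(base)
    }
}

def log_increase_cpus(start, step, value, base) {
    return check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus')
}

// Reproduces the table from the comment with start=1, step=2, base=2:
// value 1 -> 3, 2 -> 5, 4 -> 7, 8 -> 9 CPUs.
[1, 2, 4, 8].each { v ->
    println "value=${v} -> ${log_increase_cpus(1, 2, v, 2)} CPUs"
}
```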
+
+    // Default
+    cpus   = 1
+    memory = { check_max( 50.MB * task.attempt, 'memory' ) }
+    time   = { check_max( 30.min * task.attempt, 'time' ) }
+
+    withName: 'SAMTOOLS_(CONVERT|FILTER)' {
+        time = { check_max( 1.hour * task.attempt, 'time' ) }
+    }
+
+    withName: 'SAMTOOLS_(FASTA)' {
+        time = { check_max( 2.hour * task.attempt, 'time' ) }
+    }
+
+    withName: 'SAMTOOLS_(STATS)' {
+        // Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C
+        time = { check_max( 4.hour * task.attempt, 'time' ) }
     }
-    withLabel:process_low {
-        cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h   * task.attempt, 'time'    ) }
+
+    withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' {
+        time = { check_max( 8.hour * task.attempt, 'time' ) }
     }
-    withLabel:process_medium {
-        cpus   = { check_max( 6     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 36.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 8.h   * task.attempt, 'time'    ) }
+
+    withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS)' {
+        memory = { check_max( 250.MB * task.attempt, 'memory' ) }
     }
-    withLabel:process_high {
-        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 16.h  * task.attempt, 'time'    ) }
+
+    withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' {
+        memory = { check_max( 1.GB * task.attempt, 'memory' ) }
     }
-    withLabel:process_long {
-        time   = { check_max( 20.h  * task.attempt, 'time'    ) }
+    withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' {
+        memory = { check_max( 2.GB * task.attempt, 'memory' ) }
     }
-    withLabel:process_high_memory {
-        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+
+    withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
+        memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
     }
-    withLabel:error_ignore {
-        errorStrategy = 'ignore'
+
+    withName: 'SAMTOOLS_SORMADUP' {
+        cpus   = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
+        memory = { check_max( 10.GB + 0.6.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) }
+        time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
     }
-    withLabel:error_retry {
-        errorStrategy = 'retry'
-        maxRetries    = 2
+
+    withName: SAMTOOLS_SORT {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
+        // samtools sort uses up to 768M per sort thread; we budget 800 MB per thread for headroom
+        memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) }
+        time   = { check_max( 8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) }
     }
-    withName:BWAMEM2_INDEX {
-        memory = { check_max( 1.GB * Math.ceil( 28 * fasta.size() / 1000000000 ) * task.attempt, 'memory' ) }
+
+    withName: BLAST_BLASTN {
+        time   = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
+        memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
+        // The tool never seems to use more than 1 core even when given multiple;
+        // sticking to 1 (the default).
+    }
+
+    withName: BWAMEM2_INDEX {
+        memory = { check_max( 24.GB * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'memory' ) }
+        time   = { check_max( 30.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time' ) }
+        // Not multithreaded
+    }
+
+    withName: BWAMEM2_MEM {
+        // Corresponds to 12 threads as the minimum, rising to 24 threads at 3 billion reads
+        cpus   = { log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2) }
+        // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size.
+        // Runtime is considered proportional to the number of reads and inversely proportional to the number of threads.
+        time   = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) }
+        // Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM.
+        // Memory usage of SAMTOOLS_VIEW is negligible.
+        memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) }
+    }
+
+    withName: MINIMAP2_ALIGN {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) }
+        memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) }
+        time   = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
+    }
+
+    withName: CRUMBLE {
+        // No correlation between memory usage and the number of reads or the genome size.
+        // Most genomes seem happy with 1 GB, then some need 2 GB, and some need 5 GB.
+        // The formula below tries to mimic that growth and relies on job retries being allowed.
+        memory = { check_max( task.attempt * (task.attempt + 1) * 512.MB, 'memory' ) }
+        // Slightly better correlation between runtime and the number of reads.
+        time   = { check_max( 1.5.h + 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
+    }
+
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }
diff --git a/conf/modules.config b/conf/modules.config
index 08c127c..31f4c6b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -33,18 +33,10 @@ process {
     }

     withName: SAMTOOLS_COLLATE {
+        ext.args   = { (params.use_work_dir_as_temp ? "-T." : "") }
         ext.prefix = { "${meta.id}.collate" }
     }

-    withName: SAMTOOLS_FIXMATE {
-        ext.args   = '-m'
-        ext.prefix = { "${meta.id}.fixmate" }
-    }
-
-    withName: SAMTOOLS_MARKDUP {
-        ext.prefix = { "${meta.id}.markdup" }
-    }
-
     withName: BLAST_BLASTN {
         ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6'
     }
@@ -58,7 +50,14 @@ process {
     }

     withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' {
-        ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" }
+        // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
+        // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes.
+        // NOTE: minimap2 uses the decimal system! 1G = 1,000,000,000 bp
+        // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values.
+        // NOTE: minimap2 2.25 raises the default to 8G, which means greater memory savings on smaller genomes.
+        // NOTE: Use `reference.size()` for now, and switch to `meta2.genome_size` once we update the modules.
+        // ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' }
+        ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' }
     }

     withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' {
diff --git a/docs/output.md b/docs/output.md
index 8cc61c3..9722d11 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -34,7 +34,7 @@ PacBio reads generated using both CLR and CCS technology are filtered using `BLA
 ### Short reads

-Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` markduplicate workflow. The mark duplicate alignments is output in the CRAM format, along with the index.
+Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` [mark-duplicate workflow](https://www.htslib.org/algorithms/duplicate.html#workflow). The duplicate-marked alignments are output in the CRAM format, along with the index.
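To make the `-I` arithmetic above concrete, here is a small Groovy sketch of how the flag is rendered from the FASTA size in bytes (used as a stand-in for genome size, per the NOTE above). The helper name and example sizes are hypothetical, not pipeline code.

```groovy
// -I takes the number of gigabases minimap2 loads per batch; we round the
// FASTA size in bytes up to whole gigabases. Math.ceil returns a double, so
// the rendered value looks like "-I4.0G", which minimap2 accepts.
def minimapIndexArg(long fastaBytes) {
    return "-I" + Math.ceil(fastaBytes / 1e9) + 'G'
}

assert minimapIndexArg(1200000000L) == '-I2.0G' // 1.2 Gbp genome
assert minimapIndexArg(3100000000L) == '-I4.0G' // human-sized genome
assert minimapIndexArg(500000000L)  == '-I1.0G' // small genome: below the 4G default, so memory is saved
```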
Output files diff --git a/modules.json b/modules.json index ac47073..a9a06be 100644 --- a/modules.json +++ b/modules.json @@ -61,11 +61,6 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "samtools/fixmate": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] - }, "samtools/flagstat": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -76,11 +71,6 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "samtools/markdup": { - "branch": "master", - "git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8", - "installed_by": ["modules"] - }, "samtools/merge": { "branch": "master", "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", diff --git a/modules/local/pacbio_filter.nf b/modules/local/pacbio_filter.nf index 18dd11c..e4deaa4 100644 --- a/modules/local/pacbio_filter.nf +++ b/modules/local/pacbio_filter.nf @@ -5,7 +5,7 @@ process PACBIO_FILTER { conda "conda-forge::gawk=5.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'quay.io/biocontainers/gawk:5.1.0' }" + 'biocontainers/gawk:5.1.0' }" input: tuple val(meta), path(txt) diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 9c44c61..f0a3073 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK { conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'biocontainers/python:3.8.3' }" input: path samplesheet diff --git a/modules/local/samtools_sormadup.nf b/modules/local/samtools_sormadup.nf new file mode 100644 index 0000000..5aadab5 --- /dev/null +++ b/modules/local/samtools_sormadup.nf @@ -0,0 +1,77 @@ +// Copied from https://github.com/nf-core/modules/pull/3310 +// Author: Matthias De Smet, https://github.com/matthdsm +process SAMTOOLS_SORMADUP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.{bam,cram}") , emit: bam + tuple val(meta), path("*.{bai,crai}") , optional:true, emit: bam_index + tuple val(meta), path("*.metrics") , emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def args4 = task.ext.args4 ?: '' + + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + "bam" + def reference = fasta ? 
"--reference ${fasta}" : "" + + """ + samtools collate \\ + $args \\ + -O \\ + -u \\ + -T ${prefix}.collate \\ + --threads $task.cpus \\ + ${reference} \\ + ${input} \\ + - \\ + | \\ + samtools fixmate \\ + $args2 \\ + -m \\ + -u \\ + --threads $task.cpus \\ + - \\ + - \\ + | \\ + samtools sort \\ + $args3 \\ + -u \\ + -T ${prefix}.sort \\ + --threads $task.cpus \\ + - \\ + | \\ + samtools markdup \\ + -T ${prefix}.markdup \\ + -f ${prefix}.metrics \\ + --threads $task.cpus \\ + $args4 \\ + - \\ + ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/unmask.nf b/modules/local/unmask.nf index 482eefc..72d1a07 100644 --- a/modules/local/unmask.nf +++ b/modules/local/unmask.nf @@ -5,7 +5,7 @@ process UNMASK { conda "conda-forge::gawk=5.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'quay.io/biocontainers/gawk:5.1.0' }" + 'biocontainers/gawk:5.1.0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/crumble/crumble.diff b/modules/nf-core/crumble/crumble.diff index 00c5857..2c4cb1e 100644 --- a/modules/nf-core/crumble/crumble.diff +++ b/modules/nf-core/crumble/crumble.diff @@ -1,7 +1,7 @@ Changes in module 'nf-core/crumble' --- modules/nf-core/crumble/main.nf +++ modules/nf-core/crumble/main.nf -@@ -30,7 +30,7 @@ +@@ -30,11 +30,14 @@ args.contains("-O cram") ? "cram" : "sam" def bedin = keepbed ? "-R ${keepbed}" : "" @@ -10,5 +10,12 @@ Changes in module 'nf-core/crumble' if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ ++ # Need to fake REF_PATH to force crumble to use the Fasta file defined in ++ # the UR field of the @SQ headers. (bug reported to the samtools team). ++ env REF_PATH=/missing \\ + crumble \\ + $args \\ + $bedin \\ ************************************************************ diff --git a/modules/nf-core/crumble/main.nf b/modules/nf-core/crumble/main.nf index a250829..44c0c59 100644 --- a/modules/nf-core/crumble/main.nf +++ b/modules/nf-core/crumble/main.nf @@ -35,6 +35,9 @@ process CRUMBLE { def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ + # Need to fake REF_PATH to force crumble to use the Fasta file defined in + # the UR field of the @SQ headers. (bug reported to the samtools team). + env REF_PATH=/missing \\ crumble \\ $args \\ $bedin \\ diff --git a/modules/nf-core/samtools/fixmate/main.nf b/modules/nf-core/samtools/fixmate/main.nf deleted file mode 100644 index 7127bff..0000000 --- a/modules/nf-core/samtools/fixmate/main.nf +++ /dev/null @@ -1,37 +0,0 @@ -process SAMTOOLS_FIXMATE { - tag "$meta.id" - label 'process_low' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(bam) - - output: - tuple val(meta), path("*.bam"), emit: bam - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - samtools \\ - fixmate \\ - $args \\ - --threads ${task.cpus-1} \\ - $bam \\ - ${prefix}.bam \\ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/fixmate/meta.yml b/modules/nf-core/samtools/fixmate/meta.yml deleted file mode 100644 index a72c5ca..0000000 --- a/modules/nf-core/samtools/fixmate/meta.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: samtools_fixmate -description: Samtools fixmate is a tool that can fill in information (insert size, cigar, mapq) about paired end reads onto the corresponding other read. Also has options to remove secondary/unmapped alignments and recalculate whether reads are proper pairs. -keywords: - - fixmate - - samtools - - insert size - - repair - - bam - - paired - - read pairs -tools: - - samtools: - description: | - SAMtools is a set of utilities for interacting with and post-processing - short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. - These files are generated as output by short read aligners like BWA. - homepage: http://www.htslib.org/ - documentation: http://www.htslib.org/doc/samtools.html - tool_dev_url: https://github.com/samtools/samtools - doi: 10.1093/bioinformatics/btp352 - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file, must be sorted by name, not coordinate - pattern: "*.{bam,cram,sam}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - bam: - type: file - description: A BAM/CRAM/SAM file with mate information added and/or proper pairs recalled - pattern: "*.{bam,cram,sam}" - -authors: - - "@sppearce" diff --git a/modules/nf-core/samtools/markdup/main.nf b/modules/nf-core/samtools/markdup/main.nf deleted file mode 100644 index 218cf97..0000000 --- a/modules/nf-core/samtools/markdup/main.nf +++ /dev/null @@ -1,63 +0,0 @@ -process SAMTOOLS_MARKDUP { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(input) - path fasta - - output: - tuple val(meta), path("*.bam"), emit: bam, optional: true - tuple val(meta), path("*.cram"), emit: cram, optional: true - tuple val(meta), path("*.sam"), emit: sam, optional: true - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def reference = fasta ? 
"--reference ${fasta}" : "" - def extension = args.contains("--output-fmt sam") ? "sam" : - args.contains("--output-fmt bam") ? "bam" : - args.contains("--output-fmt cram") ? "cram" : - "bam" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - samtools \\ - markdup \\ - $args \\ - ${reference} \\ - -@ $task.cpus \\ - -T $prefix \\ - $input \\ - ${prefix}.${extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def extension = args.contains("--output-fmt sam") ? "sam" : - args.contains("--output-fmt bam") ? "bam" : - args.contains("--output-fmt cram") ? "cram" : - "bam" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - touch ${prefix}.${extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/markdup/meta.yml b/modules/nf-core/samtools/markdup/meta.yml deleted file mode 100644 index 4207c93..0000000 --- a/modules/nf-core/samtools/markdup/meta.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: "samtools_markdup" -description: mark duplicate alignments in a coordinate sorted file -keywords: - - bam - - duplicates - - markduplicates - - samtools -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-markdup.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: "['MIT']" - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - output: - type: file - description: Sorted BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -authors: - - "@priyanka-surana" diff --git a/nextflow.config b/nextflow.config index 298513b..f9d5027 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,9 @@ params { bwamem2_index = null fasta = null + // Execution options + use_work_dir_as_temp = false + // Boilerplate options outdir = "./results" @@ -166,6 +169,7 @@ report { trace { enabled = true file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + fields = 'task_id,hash,native_id,process,tag,status,exit,cpus,memory,time,attempt,submit,start,complete,duration,%cpu,%mem,peak_rss,rchar,wchar' } dag { enabled = true diff --git a/nextflow_schema.json b/nextflow_schema.json index 5f44c81..737b3c2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -67,6 +67,21 @@ } } }, + "execution": { + "title": "Execution", + "type": "object", + "description": "Control the execution of the pipeline.", + "default": "", + "properties": { + "use_work_dir_as_temp": { + "type": "boolean", + "description": "Set to true to make tools (e.g. 
sort, FastK, MerquryFK) use the work directory for their temporary files, rather than the system default.", + "fa_icon": "fas fa-arrow-circle-down", + "hidden": true + } + }, + "fa_icon": "fas fa-running" + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -236,6 +251,9 @@ { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/execution" + }, { "$ref": "#/definitions/institutional_config_options" }, diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index 3d28217..c1d2263 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -23,29 +23,32 @@ workflow ALIGN_ONT { | map { meta, file -> file } | set { ch_fasta } + // Align with minimap2. bam_format is set to true, making the output a *sorted* BAM MINIMAP2_ALIGN ( reads, ch_fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple ( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } | set { ch_bams } - // Merge - SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_MERGE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORT.out.bam + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 080af18..01cd1ac 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -5,7 +5,6 @@ include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' @@ -30,29 +29,32 @@ workflow ALIGN_PACBIO { | map { meta, file -> file } | set { ch_fasta } + // Align with minimap2. 
bam_format is set to true, making the output a *sorted* BAM MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, ch_fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple ( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } | set { ch_bams } - // Merge - SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] ) + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_MERGE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORT.out.bam + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 379d2fa..d8a8a53 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -3,6 +3,7 @@ // include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' workflow INPUT_CHECK { @@ -11,20 +12,35 @@ workflow INPUT_CHECK { main: + ch_versions = Channel.empty() + + // Read the samplesheet SAMPLESHEET_CHECK ( samplesheet ).csv | splitCsv ( header:true, sep:',' ) - | map { create_data_channel( it ) } + // Prepare the channel for SAMTOOLS_FLAGSTAT + | map { row -> [row + [id: file(row.datafile).baseName], file(row.datafile, checkIfExists: true), []] } + | set { samplesheet_rows } + ch_versions = ch_versions.mix ( SAMPLESHEET_CHECK.out.versions.first() ) + + // Get stats from each input file + SAMTOOLS_FLAGSTAT ( samplesheet_rows ) + ch_versions = ch_versions.mix ( SAMTOOLS_FLAGSTAT.out.versions.first() ) + + // Create the read channel for the rest of the pipeline + samplesheet_rows + | join( SAMTOOLS_FLAGSTAT.out.flagstat ) + | map { meta, datafile, meta2, stats -> create_data_channel( meta, datafile, stats ) } | set { reads } emit: reads // channel: [ val(meta), /path/to/datafile ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } // Function to get list of [ meta, reads ] -def create_data_channel ( LinkedHashMap row ) { +def create_data_channel ( LinkedHashMap row, datafile, stats ) { // create meta map def meta = [:] meta.id = row.sample @@ -39,13 +55,15 @@ def create_data_channel ( LinkedHashMap row ) { } meta.read_group = "\'@RG\\tID:" + row.datafile.split('/')[-1].split('\\.')[0] + "\\tPL:" + platform + "\\tSM:" + meta.id.split('_')[0..-2].join('_') + "\'" - - // add path(s) of the read file(s) to the meta map - def data_meta = [] - if ( !file(row.datafile).exists() ) { - exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.datafile}" - } else { - data_meta = [ meta, file(row.datafile) ] + // Read the first line of the flagstat file + // 3127898040 + 0 in total (QC-passed reads + QC-failed 
reads)
+    // and sum the two integers
+    stats.withReader {
+        def line = it.readLine()
+        def lspl = line.split()
+        def read_count = lspl[0].toLong() + lspl[2].toLong()
+        meta.read_count = read_count
     }
-    return data_meta
+
+    return [meta, datafile]
 }
diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf
index 4fca771..9b271f8 100644
--- a/subworkflows/local/markdup_stats.nf
+++ b/subworkflows/local/markdup_stats.nf
@@ -3,9 +3,10 @@
 // Convert to CRAM and calculate statistics
 //

-include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main'
-include { MARKDUPLICATE } from '../../subworkflows/local/markduplicate'
-include { CONVERT_STATS } from '../../subworkflows/local/convert_stats'
+include { SAMTOOLS_MERGE    } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_SORT     } from '../../modules/nf-core/samtools/sort/main'
+include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup'
+include { CONVERT_STATS     } from '../../subworkflows/local/convert_stats'

 workflow MARKDUP_STATS {
@@ -25,18 +26,34 @@
     // Collect all BWAMEM2 output by sample name
     SAMTOOLS_SORT.out.bam
-    | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] }
+    | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
     | groupTuple( by: [0] )
+    | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
+    | branch {
+        meta, bams ->
+            single_bam: bams.size() == 1
+            multi_bams: true
+    }
     | set { ch_bams }

+    // Merge, but only if there is more than 1 file
+    SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+
+    SAMTOOLS_MERGE.out.bam
+    | mix ( ch_bams.single_bam )
+    | set { ch_bam }
+
+
     // Mark duplicates
-    MARKDUPLICATE ( ch_bams )
-    ch_versions = ch_versions.mix ( MARKDUPLICATE.out.versions )
+    SAMTOOLS_SORMADUP ( ch_bam, fasta )
+    ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )

     // Convert merged BAM to CRAM and calculate indices and statistics
-    MARKDUPLICATE.out.bam
+    SAMTOOLS_SORMADUP.out.bam
     | map { meta, bam -> [ meta, bam, [] ] }
     | set { ch_stat }
diff --git a/subworkflows/local/markduplicate.nf b/subworkflows/local/markduplicate.nf
deleted file mode 100644
index 3f47aa4..0000000
--- a/subworkflows/local/markduplicate.nf
+++ /dev/null
@@ -1,49 +0,0 @@
-//
-// Merge BAM files and mark duplicates
-//
-
-include { SAMTOOLS_MERGE   } from '../../modules/nf-core/samtools/merge/main'
-include { SAMTOOLS_COLLATE } from '../../modules/nf-core/samtools/collate/main'
-include { SAMTOOLS_FIXMATE } from '../../modules/nf-core/samtools/fixmate/main'
-include { SAMTOOLS_SORT    } from '../../modules/nf-core/samtools/sort/main'
-include { SAMTOOLS_MARKDUP } from '../../modules/nf-core/samtools/markdup/main'
-
-
-workflow MARKDUPLICATE {
-    take:
-    bams // channel: [ val(meta), [ /path/to/bams ] ]
-
-
-    main:
-    ch_versions = Channel.empty()
-
-
-    // Merge position sorted bam files
-    SAMTOOLS_MERGE ( bams, [ [], [] ], [ [], [] ] )
-    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
-
-
-    // Collate merged BAM file
-    SAMTOOLS_COLLATE ( SAMTOOLS_MERGE.out.bam, [] )
-    ch_versions = ch_versions.mix ( SAMTOOLS_COLLATE.out.versions.first() )
-
-
-    // Fill in mate coordinates and insert size fields
-    SAMTOOLS_FIXMATE ( SAMTOOLS_COLLATE.out.bam )
-    ch_versions = ch_versions.mix ( 
SAMTOOLS_FIXMATE.out.versions.first() ) - - - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_FIXMATE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - - // Mark duplicates - SAMTOOLS_MARKDUP ( SAMTOOLS_SORT.out.bam, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MARKDUP.out.versions.first() ) - - - emit: - bam = SAMTOOLS_MARKDUP.out.bam // channel: [ val(meta), /path/to/bam ] - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index b5f23fd..5264569 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -19,12 +19,15 @@ workflow PREPARE_GENOME { // Uncompress genome fasta file if required if ( params.fasta.endsWith('.gz') ) { - ch_fasta = GUNZIP ( fasta ).gunzip + ch_unzipped = GUNZIP ( fasta ).gunzip ch_versions = ch_versions.mix ( GUNZIP.out.versions ) } else { - ch_fasta = fasta + ch_unzipped = fasta } + ch_unzipped + | map { meta, fa -> [ meta + [id: fa.baseName, genome_size: fa.size()], fa] } + | set { ch_fasta } // Unmask genome fasta UNMASK ( ch_fasta ) diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index 75995b4..9d12b0b 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -81,7 +81,7 @@ workflow READMAPPING { // SUBWORKFLOW: Uncompress and prepare reference genome files // ch_fasta - | map { [ [ id: it.baseName.tokenize(".")[0..1].join(".") ], it ] } + | map { [ [ id: it.baseName ], it ] } | set { ch_genome } PREPARE_GENOME ( ch_genome )
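The read counting introduced in `INPUT_CHECK` above leans on the first line of `samtools flagstat` output, which has the form `<QC-passed> + <QC-failed> in total (...)`. A minimal Groovy sketch of the parsing, reusing the example line from the comment:

```groovy
// First line of a flagstat report (example taken from the comment above).
def line = "3127898040 + 0 in total (QC-passed reads + QC-failed reads)"

// Token 0 is the QC-passed count, token 2 the QC-failed count; meta.read_count
// is their sum, which base.config then uses to scale memory and runtime
// (e.g. for BWAMEM2_MEM, SAMTOOLS_SORT and BLAST_BLASTN).
def lspl = line.split()
def read_count = lspl[0].toLong() + lspl[2].toLong()
assert read_count == 3127898040L
```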
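Finally, a sketch of the merge-only-when-needed channel pattern now shared by `ALIGN_ONT`, `ALIGN_PACBIO` and `MARKDUP_STATS`: group alignments by sample, sum the per-run read counts into the meta map, and let samples with a single BAM bypass `SAMTOOLS_MERGE`. The channel contents below are invented for illustration; the operator chain mirrors the subworkflow code.

```groovy
workflow {
    // Invented input: [meta, bam] tuples, with read_count already in meta.
    Channel.of(
        [[id: 'sampleA_run1', datatype: 'ont', read_count: 1000], 'a1.bam'],
        [[id: 'sampleA_run2', datatype: 'ont', read_count: 2000], 'a2.bam'],
        [[id: 'sampleB_run1', datatype: 'ont', read_count:  500], 'b1.bam']
    )
    // Strip the run suffix so all runs of a sample share one grouping key,
    // and pull read_count out of the meta map so it can be summed.
    | map { meta, bam -> [[id: meta.id.split('_')[0..-2].join('_'), datatype: meta.datatype], meta.read_count, bam] }
    | groupTuple( by: [0] )
    // Fold the summed read_count back into the grouped meta map.
    | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
    | branch {
        meta, bams ->
            single_bam: bams.size() == 1   // bypasses SAMTOOLS_MERGE
            multi_bams: true               // merged first
    }
    | set { ch_bams }

    ch_bams.single_bam.view { "single: ${it}" }   // sampleB, read_count 500
    ch_bams.multi_bams.view { "multi:  ${it}" }   // sampleA, read_count 3000
}
```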