diff --git a/CHANGELOG.md b/CHANGELOG.md index 8eb80be..8150062 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,24 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[1.2.2](https://github.com/sanger-tol/readmapping/releases/tag/1.2.2)] - Norwegian Ridgeback (patch 2) -[2024-05-23] +## [[1.3.0](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye - [2024-06-XX] + +### Enhancements & fixes + +- Combined steps to improve the efficiency of the pipeline, especially on large genomes +- "crumble" is now run on _every_ data type, not just PacBio + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. + +| Dependency | Old version | New version | +| ---------- | ------------- | ------------- | +| `samtools` | 1.14 and 1.17 | 1.17 and 1.18 | + +> **NB:** Dependency has been **updated** if both old and new version information is present.
**NB:** Dependency has been **added** if just the new version information is present.
**NB:** Dependency has been **removed** if new version information isn't present. + +## [[1.2.2](https://github.com/sanger-tol/readmapping/releases/tag/1.2.2)] - Norwegian Ridgeback (patch 2) - [2024-05-23] ### Enhancements & fixes diff --git a/conf/base.config b/conf/base.config index bfd327b..ca753f3 100644 --- a/conf/base.config +++ b/conf/base.config @@ -20,20 +20,16 @@ process { memory = { check_max( 50.MB * task.attempt, 'memory' ) } time = { check_max( 30.min * task.attempt, 'time' ) } - withName: 'SAMTOOLS_(CONVERT|FILTER)' { + withName: 'SAMTOOLS_(CONVERT)' { time = { check_max( 1.hour * task.attempt, 'time' ) } } - withName: 'SAMTOOLS_(FASTA)' { - time = { check_max( 2.hour * task.attempt, 'time' ) } - } - withName: 'SAMTOOLS_(STATS)' { // Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C time = { check_max( 4.hour * task.attempt, 'time' ) } } - withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' { + withName: 'SAMTOOLS_(COLLATETOFASTA|FILTERTOFASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|VIEW)' { time = { check_max( 8.hour * task.attempt, 'time' ) } } @@ -41,14 +37,11 @@ process { memory = { check_max( 250.MB * task.attempt, 'memory' ) } } - withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' { - memory = { check_max( 1.GB * task.attempt, 'memory' ) } - } - withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' { - memory = { check_max( 2.GB * task.attempt, 'memory' ) } + withName: 'SAMTOOLS_(STATS|VIEW)' { + memory = { check_max( ((meta.datatype == "pacbio_clr" || meta.datatype == "ont") ? 
2.GB : 1.GB) * task.attempt, 'memory' ) } } - withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' { + withName: 'SAMTOOLS_COLLATETOFASTA' { cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } } @@ -59,13 +52,6 @@ process { time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } } - withName: SAMTOOLS_SORT { - cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } - // Memory increases by 768M for each thread - memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) } - time = { check_max( 8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } - } - withName: BLAST_BLASTN { time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } @@ -84,17 +70,24 @@ process { // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size // Runtime is considered proportional to the number of reads and inversely to number of threads time = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) } - // Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM - // Memory usage of SAMTOOLS_VIEW is negligible. - memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * task.attempt * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } + // Base RAM usage is about 6 times the genome size. 
+ // Each thread takes an additional 800 MB RAM for bwa-mem2 and 800 MB for samtools sort + memory = { check_max( 8.GB + 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 1600.MB * task.attempt * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } } - withName: MINIMAP2_ALIGN { + withName: '.*:ALIGN_HIFI:MINIMAP2_ALIGN' { cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } - memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) } + memory = { check_max( 800.MB * log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) + 14.GB * Math.ceil( Math.pow(meta2.genome_size / 1000000000, 0.6)) * task.attempt, 'memory' ) } time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } } + // Extrapolated from the HIFI settings on the basis of 1 ONT alignment. CLR assumed to behave the same way as ONT + withName: '.*:ALIGN_(CLR|ONT):MINIMAP2_ALIGN' { + cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } + memory = { check_max( 800.MB * log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) + 30.GB * Math.ceil( Math.pow(meta2.genome_size / 1000000000, 0.6)) * task.attempt, 'memory' ) } + time = { check_max( 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + } + withName: CRUMBLE { // No correlation between memory usage and the number of reads or the genome size. // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB. 
diff --git a/conf/modules.config b/conf/modules.config index 31f4c6b..c9e3628 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -23,18 +23,13 @@ process { ext.args = { "-R ${meta.read_group}" } } - withName: SAMTOOLS_SORT { - ext.prefix = { "${meta.id}.sort" } - } - withName: SAMTOOLS_MERGE { ext.args = { "-c -p" } ext.prefix = { "${meta.id}.merge" } } - withName: SAMTOOLS_COLLATE { + withName: SAMTOOLS_COLLATETOFASTA { ext.args = { (params.use_work_dir_as_temp ? "-T." : "") } - ext.prefix = { "${meta.id}.collate" } } withName: BLAST_BLASTN { @@ -45,27 +40,21 @@ process { ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index" } - withName: SAMTOOLS_FILTER { - ext.prefix = { "${meta.id}.filter" } - } - - withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp. // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values. // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes - // NOTE: Use `reference.size()` for now, and switch to `meta2.genome_size` once we update the modules. 
- // ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' } - ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } + withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { - ext.args = { "-ax map-pb -R ${meta.read_group}" } + ext.args = { "-ax map-pb -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } withName: '.*:.*:ALIGN_ONT:MINIMAP2_ALIGN' { - ext.args = { "-ax map-ont -R ${meta.read_group}" } + ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { @@ -87,12 +76,7 @@ process { withName: CRUMBLE { ext.prefix = { "${input.baseName}.crumble" } - ext.args = '-y pbccs -O cram' - publishDir = [ - path: { "${params.outdir}/read_mapping/pacbio" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + ext.args = { (meta.datatype == "pacbio" ? "-y pbccs " : "") + "-O bam" } } withName: SAMPLESHEET_CHECK { @@ -103,41 +87,9 @@ process { ] } - withName: '.*:ALIGN_HIC:MARKDUP_STATS:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/read_mapping/hic" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*:ALIGN_ILLUMINA:MARKDUP_STATS:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/read_mapping/illumina" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*:ALIGN_HIFI:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/read_mapping/pacbio" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - - withName: '.*:ALIGN_CLR:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/read_mapping/pacbio" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*:ALIGN_ONT:CONVERT_STATS:.*' { + withName: '.*:CONVERT_STATS:SAMTOOLS_.*' { publishDir = [ - path: { "${params.outdir}/read_mapping/ont" }, + path: { "${params.outdir}/read_mapping/${meta.datatype}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] diff --git a/modules.json b/modules.json index a9a06be..f7da822 100644 --- a/modules.json +++ b/modules.json @@ -23,8 +23,7 @@ "crumble": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"], - "patch": "modules/nf-core/crumble/crumble.diff" + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", @@ -38,12 +37,7 @@ }, "minimap2/align": { "branch": "master", - "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", - "installed_by": ["modules"] - }, - "samtools/collate": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "efbf86bb487f288ac30660282709d9620dd6048e", "installed_by": ["modules"] }, "samtools/faidx": { @@ -51,11 +45,6 @@ "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", "installed_by": ["modules"] }, - "samtools/fasta": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] - }, "samtools/fastq": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -74,12 +63,8 @@ "samtools/merge": { "branch": "master", "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", - "installed_by": ["modules"] - }, - "samtools/sort": { - "branch": "master", - "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": 
"modules/nf-core/samtools/merge/samtools-merge.diff" }, "samtools/stats": { "branch": "master", diff --git a/modules/local/samtools_collatetofasta.nf b/modules/local/samtools_collatetofasta.nf new file mode 100644 index 0000000..81119f2 --- /dev/null +++ b/modules/local/samtools_collatetofasta.nf @@ -0,0 +1,45 @@ +process SAMTOOLS_COLLATETOFASTA { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools collate \\ + $args \\ + -O \\ + -u \\ + -T ${prefix}.collate \\ + --threads $task.cpus \\ + ${input} \\ + | \\ + samtools fasta \\ + $args2 \\ + --threads $task.cpus \\ + -0 ${prefix}.fasta \\ + > /dev/null + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/local/samtools_filtertofastq.nf similarity index 60% rename from modules/nf-core/samtools/sort/main.nf rename to modules/local/samtools_filtertofastq.nf index 2b7753f..fbee087 100644 --- a/modules/nf-core/samtools/sort/main.nf +++ b/modules/local/samtools_filtertofastq.nf @@ -1,6 +1,6 @@ -process SAMTOOLS_SORT { +process SAMTOOLS_FILTERTOFASTQ { tag "$meta.id" - label 'process_medium' + label 'process_low' conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -8,27 +8,35 @@ process SAMTOOLS_SORT { 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: - tuple val(meta), path(bam) + tuple val(meta), path(input), path(index) + path qname output: - tuple val(meta), path("*.bam"), emit: bam - tuple val(meta), path("*.csi"), emit: csi, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path("*.fastq.gz") , emit: fastq + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" """ - samtools sort \\ + samtools view \\ + --threads $task.cpus \\ + --qname-file ${qname} \\ + --unoutput - \\ $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - -T $prefix \\ - $bam + -o /dev/null \\ + $input \\ + | \\ + samtools fastq \\ + $args2 \\ + --threads $task.cpus \\ + -0 ${prefix}.fastq.gz \\ + - \\ + > /dev/null cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -39,7 +47,7 @@ process SAMTOOLS_SORT { stub: def prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${prefix}.bam + echo | gzip > ${prefix}.fastq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/crumble/crumble.diff b/modules/nf-core/crumble/crumble.diff deleted file mode 100644 index 2c4cb1e..0000000 --- a/modules/nf-core/crumble/crumble.diff +++ /dev/null @@ -1,21 +0,0 @@ -Changes in module 'nf-core/crumble' ---- modules/nf-core/crumble/main.nf -+++ modules/nf-core/crumble/main.nf -@@ -30,11 +30,14 @@ - args.contains("-O cram") ? "cram" : - "sam" - def bedin = keepbed ? "-R ${keepbed}" : "" -- def bedout = bedout ? "-b ${prefix}.out.bed" : "" -+ def bedout = bedout ? "-b ${prefix}.suspicious_regions.bed" : "" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
- - def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - """ -+ # Need to fake REF_PATH to force crumble to use the Fasta file defined in -+ # the UR field of the @SQ headers. (bug reported to the samtools team). -+ env REF_PATH=/missing \\ - crumble \\ - $args \\ - $bedin \\ - -************************************************************ diff --git a/modules/nf-core/crumble/main.nf b/modules/nf-core/crumble/main.nf index 44c0c59..2699257 100644 --- a/modules/nf-core/crumble/main.nf +++ b/modules/nf-core/crumble/main.nf @@ -30,14 +30,11 @@ process CRUMBLE { args.contains("-O cram") ? "cram" : "sam" def bedin = keepbed ? "-R ${keepbed}" : "" - def bedout = bedout ? "-b ${prefix}.suspicious_regions.bed" : "" + def bedout = bedout ? "-b ${prefix}.out.bed" : "" if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ - # Need to fake REF_PATH to force crumble to use the Fasta file defined in - # the UR field of the @SQ headers. (bug reported to the samtools team). 
- env REF_PATH=/missing \\ crumble \\ $args \\ $bedin \\ diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml new file mode 100644 index 0000000..cf6e775 --- /dev/null +++ b/modules/nf-core/minimap2/align/environment.yml @@ -0,0 +1,9 @@ +name: minimap2_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::minimap2=2.24 + - bioconda::samtools=1.18 + - bioconda::htslib=1.18 diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf index 4da47c1..07a3215 100644 --- a/modules/nf-core/minimap2/align/main.nf +++ b/modules/nf-core/minimap2/align/main.nf @@ -3,14 +3,14 @@ process MINIMAP2_ALIGN { label 'process_medium' // Note: the versions here need to match the versions used in the mulled container below and minimap2/index - conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : - 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' }" input: tuple val(meta), path(reads) - path reference + tuple val(meta2), path(reference) val bam_format val cigar_paf_format val cigar_bam @@ -24,9 +24,10 @@ process MINIMAP2_ALIGN { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def bam_output = bam_format ? 
"-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" + def bam_output = bam_format ? "-a | samtools sort -@ ${task.cpus} -o ${prefix}.bam ${args2}" : "-o ${prefix}.paf" def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' """ diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml index 991b39a..408522d 100644 --- a/modules/nf-core/minimap2/align/meta.yml +++ b/modules/nf-core/minimap2/align/meta.yml @@ -25,6 +25,11 @@ input: description: | List of input FASTA or FASTQ files of size 1 and 2 for single-end and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test_ref'] - reference: type: file description: | @@ -63,3 +68,8 @@ authors: - "@sofstam" - "@sateeshperi" - "@jfy133" +maintainers: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test b/modules/nf-core/minimap2/align/tests/main.nf.test new file mode 100644 index 0000000..b634468 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test @@ -0,0 +1,145 @@ +nextflow_process { + + name "Test Process MINIMAP2_ALIGN" + script "../main.nf" + process "MINIMAP2_ALIGN" + + tag "modules" + tag "modules_nfcore" + tag "minimap2" + tag "minimap2/align" + + test("sarscov2 - fastq, fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + 
process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, [], true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git 
a/modules/nf-core/minimap2/align/tests/main.nf.test.snap b/modules/nf-core/minimap2/align/tests/main.nf.test.snap new file mode 100644 index 0000000..a39a169 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test.snap @@ -0,0 +1,38 @@ +{ + "sarscov2 - fastq, fasta, true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:06.01315354" + }, + "sarscov2 - fastq, fasta, true, false, false - stub": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:24.487175659" + }, + "sarscov2 - [fastq1, fastq2], fasta, true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:12.50816279" + }, + "sarscov2 - fastq, [], true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:18.414974788" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/tests/tags.yml b/modules/nf-core/minimap2/align/tests/tags.yml new file mode 100644 index 0000000..39dba37 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/tags.yml @@ -0,0 +1,2 @@ +minimap2/align: + - "modules/nf-core/minimap2/align/**" diff --git a/modules/nf-core/samtools/collate/main.nf b/modules/nf-core/samtools/collate/main.nf deleted file mode 100644 index b23246b..0000000 --- a/modules/nf-core/samtools/collate/main.nf +++ /dev/null @@ -1,46 +0,0 @@ -process SAMTOOLS_COLLATE { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0': - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(input) - path fasta - - output: - tuple val(meta), path("*.bam"), emit: bam, optional: true - tuple val(meta), path("*.cram"), emit: cram, optional: true - tuple val(meta), path("*.sam"), emit: sam, optional: true - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def reference = fasta ? "--reference ${fasta}" : "" - def extension = args.contains("--output-fmt sam") ? "sam" : - args.contains("--output-fmt bam") ? "bam" : - args.contains("--output-fmt cram") ? "cram" : - "bam" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - samtools \\ - collate \\ - $args \\ - ${reference} \\ - -@ $task.cpus \\ - -o ${prefix}.${extension} \\ - $input - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/collate/meta.yml b/modules/nf-core/samtools/collate/meta.yml deleted file mode 100644 index 0e78403..0000000 --- a/modules/nf-core/samtools/collate/meta.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: "samtools_collate" -description: shuffles and groups reads together by their names -keywords: - - collate - - bam -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-collate.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: "['MIT']" - -input: - # Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -output: - #Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - output: - type: file - description: Collated BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -authors: - - "@priyanka-surana" diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf deleted file mode 100644 index 3145965..0000000 --- a/modules/nf-core/samtools/fasta/main.nf +++ /dev/null @@ -1,44 +0,0 @@ -process SAMTOOLS_FASTA { - tag "$meta.id" - label 'process_low' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(input) - val(interleave) - - output: - tuple val(meta), path("*_{1,2}.fasta.gz") , optional:true, emit: fasta - tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved - tuple val(meta), path("*_singleton.fasta.gz") , optional:true, emit: singleton - tuple val(meta), path("*_other.fasta.gz") , optional:true, emit: other - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" : - meta.single_end ? 
"-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : - "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" - """ - samtools \\ - fasta \\ - $args \\ - --threads ${task.cpus-1} \\ - -0 ${prefix}_other.fasta.gz \\ - $input \\ - $output - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml deleted file mode 100644 index 8e45986..0000000 --- a/modules/nf-core/samtools/fasta/meta.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: "samtools_fasta" -description: Converts a SAM/BAM/CRAM file to FASTA -keywords: - - bam - - sam - - cram - - fasta -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-fasta.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: "['MIT']" - -input: - # Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - interleave: - type: boolean - description: Set true for interleaved fasta files - -output: - #Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - fasta: - type: file - description: Compressed FASTA file(s) with reads with either the READ1 or READ2 flag set in separate files. - pattern: "*_{1,2}.fasta.gz" - - interleaved: - type: file - description: Compressed FASTA file with reads with either the READ1 or READ2 flag set in a combined file. 
Needs collated input file. - pattern: "*_interleaved.fasta.gz" - - singleton: - type: file - description: Compressed FASTA file with singleton reads - pattern: "*_singleton.fasta.gz" - - other: - type: file - description: Compressed FASTA file with reads with either both READ1 and READ2 flags set or unset - pattern: "*_other.fasta.gz" - -authors: - - "@priyanka-surana" diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf index b73b7cb..5fb8ede 100644 --- a/modules/nf-core/samtools/merge/main.nf +++ b/modules/nf-core/samtools/merge/main.nf @@ -26,6 +26,11 @@ process SAMTOOLS_MERGE { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + if (input_files instanceof List) { + sorted_input_files = input_files.toSorted({it.name}).join(' ') + } else { + sorted_input_files = input_files + } def reference = fasta ? "--reference ${fasta}" : "" """ samtools \\ @@ -34,7 +39,7 @@ process SAMTOOLS_MERGE { $args \\ ${reference} \\ ${prefix}.${file_type} \\ - $input_files + $sorted_input_files cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff new file mode 100644 index 0000000..cca0b3c --- /dev/null +++ b/modules/nf-core/samtools/merge/samtools-merge.diff @@ -0,0 +1,26 @@ +Changes in module 'nf-core/samtools/merge' +--- modules/nf-core/samtools/merge/main.nf ++++ modules/nf-core/samtools/merge/main.nf +@@ -26,6 +26,11 @@ + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() ++ if (input_files instanceof List) { ++ sorted_input_files = input_files.toSorted({it.name}).join(' ') ++ } else { ++ sorted_input_files = input_files ++ } + def reference = fasta ? 
"--reference ${fasta}" : "" + """ + samtools \\ +@@ -34,7 +39,7 @@ + $args \\ + ${reference} \\ + ${prefix}.${file_type} \\ +- $input_files ++ $sorted_input_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml deleted file mode 100644 index 0732843..0000000 --- a/modules/nf-core/samtools/sort/meta.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: samtools_sort -description: Sort SAM/BAM/CRAM file -keywords: - - sort - - bam - - sam - - cram -tools: - - samtools: - description: | - SAMtools is a set of utilities for interacting with and post-processing - short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. - These files are generated as output by short read aligners like BWA. - homepage: http://www.htslib.org/ - documentation: http://www.htslib.org/doc/samtools.html - doi: 10.1093/bioinformatics/btp352 - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: Sorted BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - csi: - type: file - description: BAM index file (optional) - pattern: "*.csi" -authors: - - "@drpatelh" - - "@ewels" diff --git a/nextflow.config b/nextflow.config index cd9aae7..d09f872 100644 --- a/nextflow.config +++ b/nextflow.config @@ -183,7 +183,7 @@ manifest { description = 'Pipeline to map reads generated using different sequencing technologies against a genome assembly.' 
mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.2.2' + version = '1.3.0' doi = '10.5281/zenodo.6563577' } diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index c1d2263..f1013d4 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -4,8 +4,6 @@ include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' workflow ALIGN_ONT { @@ -18,13 +16,8 @@ workflow ALIGN_ONT { ch_versions = Channel.empty() - // Align Fastq to Genome - fasta - | map { meta, file -> file } - | set { ch_fasta } - - // Align with minimap2. bam_format is set to true, making the output a *sorted* BAM - MINIMAP2_ALIGN ( reads, ch_fasta, true, false, false ) + // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM + MINIMAP2_ALIGN ( reads, fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) @@ -52,15 +45,8 @@ workflow ALIGN_ONT { | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } - CONVERT_STATS ( ch_sort, fasta ) - ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) - emit: - cram = CONVERT_STATS.out.cram // channel: [ val(meta), /path/to/cram ] - crai = CONVERT_STATS.out.crai // channel: [ val(meta), /path/to/crai ] - stats = CONVERT_STATS.out.stats // channel: [ val(meta), /path/to/stats ] - idxstats = CONVERT_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] - flagstat = CONVERT_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + bam = ch_sort // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 01cd1ac..07855a7 100644 
--- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -5,7 +5,6 @@ include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' workflow ALIGN_PACBIO { @@ -24,13 +23,8 @@ workflow ALIGN_PACBIO { ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) - // Align Fastq to Genome - fasta - | map { meta, file -> file } - | set { ch_fasta } - - // Align with minimap2. bam_format is set to true, making the output a *sorted* BAM - MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, ch_fasta, true, false, false ) + // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM + MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) @@ -58,15 +52,8 @@ workflow ALIGN_PACBIO { | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } - CONVERT_STATS ( ch_sort, fasta ) - ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) - emit: - cram = CONVERT_STATS.out.cram // channel: [ val(meta), /path/to/cram ] - crai = CONVERT_STATS.out.crai // channel: [ val(meta), /path/to/crai ] - stats = CONVERT_STATS.out.stats // channel: [ val(meta), /path/to/stats ] - idxstats = CONVERT_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] - flagstat = CONVERT_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + bam = ch_sort // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index a6e574b..33c27b6 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -2,9 +2,10 @@ // Align short read (HiC and Illumina) data against the genome 
// -include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' -include { BWAMEM2_MEM } from '../../modules/nf-core/bwamem2/mem/main' -include { MARKDUP_STATS } from '../../subworkflows/local/markdup_stats' +include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' +include { BWAMEM2_MEM } from '../../modules/nf-core/bwamem2/mem/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' workflow ALIGN_SHORT { @@ -23,21 +24,46 @@ workflow ALIGN_SHORT { ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) - // Align Fastq to Genome - BWAMEM2_MEM ( SAMTOOLS_FASTQ.out.fastq, index, [] ) + // Align Fastq to Genome and output sorted BAM + BWAMEM2_MEM ( SAMTOOLS_FASTQ.out.fastq, index, true ) ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() ) - // Merge, markdup, convert, and stats - MARKDUP_STATS ( BWAMEM2_MEM.out.bam, fasta ) - ch_versions = ch_versions.mix ( MARKDUP_STATS.out.versions ) + // Collect all BWAMEM2 output by sample name + BWAMEM2_MEM.out.bam + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } + | groupTuple( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } + | set { ch_bams } + + + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) + | set { ch_bam } + + + // Mark duplicates + SAMTOOLS_SORMADUP ( ch_bam, fasta ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) + + + // Convert merged BAM to CRAM and calculate indices and statistics + SAMTOOLS_SORMADUP.out.bam + | map { meta, bam -> [ meta, 
bam, [] ] } + | set { ch_stat } emit: - cram = MARKDUP_STATS.out.cram // channel: [ val(meta), /path/to/cram ] - crai = MARKDUP_STATS.out.crai // channel: [ val(meta), /path/to/crai ] - stats = MARKDUP_STATS.out.stats // channel: [ val(meta), /path/to/stats ] - idxstats = MARKDUP_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] - flagstat = MARKDUP_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + bam = ch_stat // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/convert_stats.nf b/subworkflows/local/convert_stats.nf index 2ea16b9..b89928a 100644 --- a/subworkflows/local/convert_stats.nf +++ b/subworkflows/local/convert_stats.nf @@ -2,6 +2,7 @@ // Convert BAM to CRAM, create index and calculate statistics // +include { CRUMBLE } from '../../modules/nf-core/crumble/main' include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' @@ -17,11 +18,28 @@ workflow CONVERT_STATS { main: ch_versions = Channel.empty() + // Compress the quality scores of Illumina and PacBio CCS alignments + bam + | branch { + meta, bam, bai -> + run_crumble : meta.datatype == "hic" || meta.datatype == "illumina" || meta.datatype == "pacbio" + [meta, bam] + no_crumble: true + } + | set { ch_bams } + + CRUMBLE ( ch_bams.run_crumble, [], [] ) + ch_versions = ch_versions.mix ( CRUMBLE.out.versions ) + // Convert BAM to CRAM - SAMTOOLS_VIEW ( bam, fasta, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) + CRUMBLE.out.bam + | map { meta, bam -> [meta, bam, []] } + | mix ( ch_bams.no_crumble ) + | set { ch_bams_for_conversion } + SAMTOOLS_VIEW ( ch_bams_for_conversion, fasta, [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) // Combine CRAM and CRAI into one channel 
SAMTOOLS_VIEW.out.cram diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index 7dfe8a1..759880c 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -4,13 +4,10 @@ // include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtools/view/main' -include { SAMTOOLS_COLLATE } from '../../modules/nf-core/samtools/collate/main' -include { SAMTOOLS_FASTA } from '../../modules/nf-core/samtools/fasta/main' -include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { SAMTOOLS_COLLATETOFASTA } from '../../modules/local/samtools_collatetofasta' include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main' include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' -include { SAMTOOLS_VIEW as SAMTOOLS_FILTER } from '../../modules/nf-core/samtools/view/main' -include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq' workflow FILTER_PACBIO { @@ -33,22 +30,12 @@ workflow FILTER_PACBIO { // Collate BAM file to create interleaved FASTA - SAMTOOLS_COLLATE ( SAMTOOLS_CONVERT.out.bam, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_COLLATE.out.versions.first() ) - - - // Convert BAM to FASTA - SAMTOOLS_FASTA ( SAMTOOLS_COLLATE.out.bam, true ) - ch_versions = ch_versions.mix ( SAMTOOLS_FASTA.out.versions.first() ) - - - // Gunzip FASTA file to BLAST - GUNZIP ( SAMTOOLS_FASTA.out.other ) - ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() ) + SAMTOOLS_COLLATETOFASTA ( SAMTOOLS_CONVERT.out.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions.first() ) // Nucleotide BLAST - BLAST_BLASTN ( GUNZIP.out.gunzip, db ) + BLAST_BLASTN ( SAMTOOLS_COLLATETOFASTA.out.fasta, db ) ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) @@ -57,7 +44,7 @@ workflow FILTER_PACBIO { ch_versions = ch_versions.mix ( 
PACBIO_FILTER.out.versions.first() ) - // Create filtered BAM file + // Filter the BAM file and convert to FASTQ SAMTOOLS_CONVERT.out.bam | join ( SAMTOOLS_CONVERT.out.csi ) | join ( PACBIO_FILTER.out.list ) @@ -71,16 +58,11 @@ workflow FILTER_PACBIO { | map { meta, bam, csi, list -> list } | set { ch_lists } - SAMTOOLS_FILTER ( ch_reads, [ [], [] ], ch_lists ) - ch_versions = ch_versions.mix ( SAMTOOLS_FILTER.out.versions.first() ) - - - // Convert BAM to FASTQ - SAMTOOLS_FASTQ ( SAMTOOLS_FILTER.out.unoutput, true ) - ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) + SAMTOOLS_FILTERTOFASTQ ( ch_reads, ch_lists ) + ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() ) emit: - fastq = SAMTOOLS_FASTQ.out.other // channel: [ meta, /path/to/fastq ] - versions = ch_versions // channel: [ versions.yml ] + fastq = SAMTOOLS_FILTERTOFASTQ.out.fastq // channel: [ meta, /path/to/fastq ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf deleted file mode 100644 index 9b271f8..0000000 --- a/subworkflows/local/markdup_stats.nf +++ /dev/null @@ -1,71 +0,0 @@ -// -// Merge and Markdup all alignments at specimen level -// Convert to CRAM and calculate statistics -// - -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' - - -workflow MARKDUP_STATS { - take: - aln // channel: [ val(meta), /path/to/bam ] - fasta // channel: [ val(meta), /path/to/fasta ] - - - main: - ch_versions = Channel.empty() - - - // Sort BAM file - SAMTOOLS_SORT ( aln ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - - // Collect all BWAMEM2 output by sample name - SAMTOOLS_SORT.out.bam - | map { meta, 
bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple( by: [0] ) - | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } - | branch { - meta, bams -> - single_bam: bams.size() == 1 - multi_bams: true - } - | set { ch_bams } - - - // Merge, but only if there is more than 1 file - SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - SAMTOOLS_MERGE.out.bam - | mix ( ch_bams.single_bam ) - | set { ch_bam } - - - // Mark duplicates - SAMTOOLS_SORMADUP ( ch_bam, fasta ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) - - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORMADUP.out.bam - | map { meta, bam -> [ meta, bam, [] ] } - | set { ch_stat } - - CONVERT_STATS ( ch_stat, fasta ) - ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) - - - emit: - cram = CONVERT_STATS.out.cram // channel: [ val(meta), /path/to/cram ] - crai = CONVERT_STATS.out.crai // channel: [ val(meta), /path/to/crai ] - stats = CONVERT_STATS.out.stats // channel: [ val(meta), /path/to/stats ] - idxstats = CONVERT_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] - flagstat = CONVERT_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index 9d12b0b..18ebd36 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -32,6 +32,7 @@ include { ALIGN_SHORT as ALIGN_ILLUMINA } from '../subworkflows/local/align_shor include { ALIGN_PACBIO as ALIGN_HIFI } from '../subworkflows/local/align_pacbio' include { ALIGN_PACBIO as ALIGN_CLR } from '../subworkflows/local/align_pacbio' include { ALIGN_ONT } from '../subworkflows/local/align_ont' +include { CONVERT_STATS } from '../subworkflows/local/convert_stats' /* @@ 
-45,7 +46,6 @@ include { ALIGN_ONT } from '../subworkflows/local/align_ont' // include { UNTAR } from '../modules/nf-core/untar/main' -include { CRUMBLE } from '../modules/nf-core/crumble/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -126,11 +126,14 @@ workflow READMAPPING { ch_versions = ch_versions.mix ( ALIGN_ONT.out.versions ) - // - // MODULE: To compress PacBio HiFi aligned CRAM files - // - CRUMBLE ( ALIGN_HIFI.out.cram, [], true ) - ch_versions = ch_versions.mix ( CRUMBLE.out.versions ) + ch_aligned_bams = Channel.empty() + | mix( ALIGN_HIC.out.bam ) + | mix( ALIGN_ILLUMINA.out.bam ) + | mix( ALIGN_HIFI.out.bam ) + | mix( ALIGN_CLR.out.bam ) + | mix( ALIGN_ONT.out.bam ) + CONVERT_STATS ( ch_aligned_bams, PREPARE_GENOME.out.fasta ) + ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) //