From c76d9a997b5ba2a033c237e0abb6ad57246d7614 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Feb 2024 15:36:49 +0000 Subject: [PATCH 01/16] Sort the BAM files before merging to make the output reproducible --- modules/nf-core/samtools/merge/main.nf | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf index b73b7cb..5fb8ede 100644 --- a/modules/nf-core/samtools/merge/main.nf +++ b/modules/nf-core/samtools/merge/main.nf @@ -26,6 +26,11 @@ process SAMTOOLS_MERGE { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + if (input_files instanceof List) { + sorted_input_files = input_files.toSorted({it.name}).join(' ') + } else { + sorted_input_files = input_files + } def reference = fasta ? "--reference ${fasta}" : "" """ samtools \\ @@ -34,7 +39,7 @@ process SAMTOOLS_MERGE { $args \\ ${reference} \\ ${prefix}.${file_type} \\ - $input_files + $sorted_input_files cat <<-END_VERSIONS > versions.yml "${task.process}": From 15ad23dc071414d3dc31dc13d775f09e440ef4a4 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 25 Jan 2024 15:36:53 +0000 Subject: [PATCH 02/16] Mistletoe (96Gbp, 2.4 M reads) takes 210 GB RAM and slightly over 2 hours --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index bfd327b..6a37a76 100644 --- a/conf/base.config +++ b/conf/base.config @@ -91,7 +91,7 @@ process { withName: MINIMAP2_ALIGN { cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } - memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) } + memory = { check_max( 14.GB * Math.ceil( Math.pow(reference.size() / 1000000000, 0.6)) * task.attempt, 'memory' ) } time = { 
check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } } From e370b8c3d7e8f8653e50cf7fd3ef655db9dfd31d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 25 Jan 2024 16:00:31 +0000 Subject: [PATCH 03/16] Created a local module to convert to Fasta without intermediate files --- conf/base.config | 8 +-- conf/modules.config | 3 +- modules.json | 10 ---- modules/local/samtools_collatetofasta.nf | 45 +++++++++++++++++ modules/nf-core/samtools/collate/main.nf | 46 ----------------- modules/nf-core/samtools/collate/meta.yml | 44 ---------------- modules/nf-core/samtools/fasta/main.nf | 44 ---------------- modules/nf-core/samtools/fasta/meta.yml | 61 ----------------------- subworkflows/local/filter_pacbio.nf | 20 ++------ 9 files changed, 52 insertions(+), 229 deletions(-) create mode 100644 modules/local/samtools_collatetofasta.nf delete mode 100644 modules/nf-core/samtools/collate/main.nf delete mode 100644 modules/nf-core/samtools/collate/meta.yml delete mode 100644 modules/nf-core/samtools/fasta/main.nf delete mode 100644 modules/nf-core/samtools/fasta/meta.yml diff --git a/conf/base.config b/conf/base.config index 6a37a76..5ea996e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -24,16 +24,12 @@ process { time = { check_max( 1.hour * task.attempt, 'time' ) } } - withName: 'SAMTOOLS_(FASTA)' { - time = { check_max( 2.hour * task.attempt, 'time' ) } - } - withName: 'SAMTOOLS_(STATS)' { // Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C time = { check_max( 4.hour * task.attempt, 'time' ) } } - withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' { + withName: 'SAMTOOLS_(COLLATETOFASTA|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' { time = { check_max( 8.hour * task.attempt, 'time' ) } } @@ -48,7 +44,7 @@ process { memory = { check_max( 2.GB * task.attempt, 'memory' ) } } - withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' { + withName: 'SAMTOOLS_COLLATETOFASTA' { cpus = { 
log_increase_cpus(4, 2*task.attempt, 1, 2) } memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } } diff --git a/conf/modules.config b/conf/modules.config index 31f4c6b..63f071d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,9 +32,8 @@ process { ext.prefix = { "${meta.id}.merge" } } - withName: SAMTOOLS_COLLATE { + withName: SAMTOOLS_COLLATETOFASTA { ext.args = { (params.use_work_dir_as_temp ? "-T." : "") } - ext.prefix = { "${meta.id}.collate" } } withName: BLAST_BLASTN { diff --git a/modules.json b/modules.json index a9a06be..66a383a 100644 --- a/modules.json +++ b/modules.json @@ -41,21 +41,11 @@ "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["modules"] }, - "samtools/collate": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] - }, "samtools/faidx": { "branch": "master", "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", "installed_by": ["modules"] }, - "samtools/fasta": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] - }, "samtools/fastq": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/local/samtools_collatetofasta.nf b/modules/local/samtools_collatetofasta.nf new file mode 100644 index 0000000..81119f2 --- /dev/null +++ b/modules/local/samtools_collatetofasta.nf @@ -0,0 +1,45 @@ +process SAMTOOLS_COLLATETOFASTA { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools collate \\ + $args \\ + -O \\ + -u \\ + -T ${prefix}.collate \\ + --threads $task.cpus \\ + ${input} \\ + | \\ + samtools fasta \\ + $args2 \\ + --threads $task.cpus \\ + -0 ${prefix}.fasta \\ + > /dev/null + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/collate/main.nf b/modules/nf-core/samtools/collate/main.nf deleted file mode 100644 index b23246b..0000000 --- a/modules/nf-core/samtools/collate/main.nf +++ /dev/null @@ -1,46 +0,0 @@ -process SAMTOOLS_COLLATE { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0': - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(input) - path fasta - - output: - tuple val(meta), path("*.bam"), emit: bam, optional: true - tuple val(meta), path("*.cram"), emit: cram, optional: true - tuple val(meta), path("*.sam"), emit: sam, optional: true - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def reference = fasta ? "--reference ${fasta}" : "" - def extension = args.contains("--output-fmt sam") ? "sam" : - args.contains("--output-fmt bam") ? 
"bam" : - args.contains("--output-fmt cram") ? "cram" : - "bam" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - samtools \\ - collate \\ - $args \\ - ${reference} \\ - -@ $task.cpus \\ - -o ${prefix}.${extension} \\ - $input - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/collate/meta.yml b/modules/nf-core/samtools/collate/meta.yml deleted file mode 100644 index 0e78403..0000000 --- a/modules/nf-core/samtools/collate/meta.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: "samtools_collate" -description: shuffles and groups reads together by their names -keywords: - - collate - - bam -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-collate.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: "['MIT']" - -input: - # Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -output: - #Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - output: - type: file - description: Collated BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -authors: - - "@priyanka-surana" diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf deleted file mode 100644 index 3145965..0000000 --- a/modules/nf-core/samtools/fasta/main.nf +++ /dev/null @@ -1,44 +0,0 @@ -process SAMTOOLS_FASTA { - tag "$meta.id" - label 'process_low' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(input) - val(interleave) - - output: - tuple val(meta), path("*_{1,2}.fasta.gz") , optional:true, emit: fasta - tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved - tuple val(meta), path("*_singleton.fasta.gz") , optional:true, emit: singleton - tuple val(meta), path("*_other.fasta.gz") , optional:true, emit: other - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" : - meta.single_end ? 
"-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : - "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" - """ - samtools \\ - fasta \\ - $args \\ - --threads ${task.cpus-1} \\ - -0 ${prefix}_other.fasta.gz \\ - $input \\ - $output - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml deleted file mode 100644 index 8e45986..0000000 --- a/modules/nf-core/samtools/fasta/meta.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: "samtools_fasta" -description: Converts a SAM/BAM/CRAM file to FASTA -keywords: - - bam - - sam - - cram - - fasta -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-fasta.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: "['MIT']" - -input: - # Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - interleave: - type: boolean - description: Set true for interleaved fasta files - -output: - #Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - fasta: - type: file - description: Compressed FASTA file(s) with reads with either the READ1 or READ2 flag set in separate files. - pattern: "*_{1,2}.fasta.gz" - - interleaved: - type: file - description: Compressed FASTA file with reads with either the READ1 or READ2 flag set in a combined file. 
Needs collated input file. - pattern: "*_interleaved.fasta.gz" - - singleton: - type: file - description: Compressed FASTA file with singleton reads - pattern: "*_singleton.fasta.gz" - - other: - type: file - description: Compressed FASTA file with reads with either both READ1 and READ2 flags set or unset - pattern: "*_other.fasta.gz" - -authors: - - "@priyanka-surana" diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index 7dfe8a1..818534e 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -4,9 +4,7 @@ // include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtools/view/main' -include { SAMTOOLS_COLLATE } from '../../modules/nf-core/samtools/collate/main' -include { SAMTOOLS_FASTA } from '../../modules/nf-core/samtools/fasta/main' -include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { SAMTOOLS_COLLATETOFASTA } from '../../modules/local/samtools_collatetofasta' include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main' include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' include { SAMTOOLS_VIEW as SAMTOOLS_FILTER } from '../../modules/nf-core/samtools/view/main' @@ -33,22 +31,12 @@ workflow FILTER_PACBIO { // Collate BAM file to create interleaved FASTA - SAMTOOLS_COLLATE ( SAMTOOLS_CONVERT.out.bam, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_COLLATE.out.versions.first() ) - - - // Convert BAM to FASTA - SAMTOOLS_FASTA ( SAMTOOLS_COLLATE.out.bam, true ) - ch_versions = ch_versions.mix ( SAMTOOLS_FASTA.out.versions.first() ) - - - // Gunzip FASTA file to BLAST - GUNZIP ( SAMTOOLS_FASTA.out.other ) - ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() ) + SAMTOOLS_COLLATETOFASTA ( SAMTOOLS_CONVERT.out.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions.first() ) // Nucleotide BLAST - BLAST_BLASTN ( GUNZIP.out.gunzip, db ) + BLAST_BLASTN ( SAMTOOLS_COLLATETOFASTA.out.fasta, db ) 
ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) From 04c160d2bc44d4042c562895436e982087e26d46 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 25 Jan 2024 16:35:10 +0000 Subject: [PATCH 04/16] Simplified the topology of the pipeline by instantiating the CONVERT_STATS sub-workflow just once --- conf/base.config | 7 ++---- conf/modules.config | 36 ++--------------------------- subworkflows/local/align_ont.nf | 10 +------- subworkflows/local/align_pacbio.nf | 10 +------- subworkflows/local/align_short.nf | 6 +---- subworkflows/local/markdup_stats.nf | 10 +------- workflows/readmapping.nf | 15 +++++++++++- 7 files changed, 22 insertions(+), 72 deletions(-) diff --git a/conf/base.config b/conf/base.config index 5ea996e..bdb122d 100644 --- a/conf/base.config +++ b/conf/base.config @@ -37,11 +37,8 @@ process { memory = { check_max( 250.MB * task.attempt, 'memory' ) } } - withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' { - memory = { check_max( 1.GB * task.attempt, 'memory' ) } - } - withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' { - memory = { check_max( 2.GB * task.attempt, 'memory' ) } + withName: 'SAMTOOLS_(STATS|VIEW)' { + memory = { check_max( ((meta.datatype == "pacbio_clr" || meta.datatype == "ont") ? 2.GB : 1.GB) * task.attempt, 'memory' ) } } withName: 'SAMTOOLS_COLLATETOFASTA' { diff --git a/conf/modules.config b/conf/modules.config index 63f071d..2390814 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -102,41 +102,9 @@ process { ] } - withName: '.*:ALIGN_HIC:MARKDUP_STATS:CONVERT_STATS:.*' { + withName: '.*:CONVERT_STATS:.*' { publishDir = [ - path: { "${params.outdir}/read_mapping/hic" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - - withName: '.*:ALIGN_ILLUMINA:MARKDUP_STATS:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/read_mapping/illumina" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*:ALIGN_HIFI:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/read_mapping/pacbio" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*:ALIGN_CLR:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/read_mapping/pacbio" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*:ALIGN_ONT:CONVERT_STATS:.*' { - publishDir = [ - path: { "${params.outdir}/read_mapping/ont" }, + path: { "${params.outdir}/read_mapping/${meta.datatype}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index c1d2263..cf1feeb 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -5,7 +5,6 @@ include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' workflow ALIGN_ONT { @@ -52,15 +51,8 @@ workflow ALIGN_ONT { | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } - CONVERT_STATS ( ch_sort, fasta ) - ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) - emit: - cram = CONVERT_STATS.out.cram // channel: [ val(meta), /path/to/cram ] - crai = CONVERT_STATS.out.crai // channel: [ val(meta), /path/to/crai ] - stats = CONVERT_STATS.out.stats // channel: [ val(meta), /path/to/stats ] - idxstats = CONVERT_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] - flagstat = CONVERT_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + bam = ch_sort // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 01cd1ac..a29e827 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -5,7 +5,6 @@ include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' workflow ALIGN_PACBIO { @@ -58,15 +57,8 @@ workflow ALIGN_PACBIO { | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } - CONVERT_STATS ( ch_sort, fasta ) - ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) - emit: - cram = 
CONVERT_STATS.out.cram // channel: [ val(meta), /path/to/cram ] - crai = CONVERT_STATS.out.crai // channel: [ val(meta), /path/to/crai ] - stats = CONVERT_STATS.out.stats // channel: [ val(meta), /path/to/stats ] - idxstats = CONVERT_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] - flagstat = CONVERT_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + bam = ch_sort // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index a6e574b..73a8e2e 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -34,10 +34,6 @@ workflow ALIGN_SHORT { emit: - cram = MARKDUP_STATS.out.cram // channel: [ val(meta), /path/to/cram ] - crai = MARKDUP_STATS.out.crai // channel: [ val(meta), /path/to/crai ] - stats = MARKDUP_STATS.out.stats // channel: [ val(meta), /path/to/stats ] - idxstats = MARKDUP_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] - flagstat = MARKDUP_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + bam = MARKDUP_STATS.out.bam // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf index 9b271f8..7d30257 100644 --- a/subworkflows/local/markdup_stats.nf +++ b/subworkflows/local/markdup_stats.nf @@ -6,7 +6,6 @@ include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' workflow MARKDUP_STATS { @@ -57,15 +56,8 @@ workflow MARKDUP_STATS { | map { meta, bam -> [ meta, bam, [] ] } | set { ch_stat } - CONVERT_STATS ( ch_stat, fasta ) - ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) - emit: 
- cram = CONVERT_STATS.out.cram // channel: [ val(meta), /path/to/cram ] - crai = CONVERT_STATS.out.crai // channel: [ val(meta), /path/to/crai ] - stats = CONVERT_STATS.out.stats // channel: [ val(meta), /path/to/stats ] - idxstats = CONVERT_STATS.out.idxstats // channel: [ val(meta), /path/to/idxstats ] - flagstat = CONVERT_STATS.out.flagstat // channel: [ val(meta), /path/to/flagstat ] + bam = ch_stat // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index 9d12b0b..e579d3e 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -32,6 +32,7 @@ include { ALIGN_SHORT as ALIGN_ILLUMINA } from '../subworkflows/local/align_shor include { ALIGN_PACBIO as ALIGN_HIFI } from '../subworkflows/local/align_pacbio' include { ALIGN_PACBIO as ALIGN_CLR } from '../subworkflows/local/align_pacbio' include { ALIGN_ONT } from '../subworkflows/local/align_ont' +include { CONVERT_STATS } from '../subworkflows/local/convert_stats' /* @@ -126,10 +127,22 @@ workflow READMAPPING { ch_versions = ch_versions.mix ( ALIGN_ONT.out.versions ) + ch_aligned_bams = Channel.empty() + | mix( ALIGN_HIC.out.bam ) + | mix( ALIGN_ILLUMINA.out.bam ) + | mix( ALIGN_HIFI.out.bam ) + | mix( ALIGN_CLR.out.bam ) + | mix( ALIGN_ONT.out.bam ) + CONVERT_STATS ( ch_aligned_bams, PREPARE_GENOME.out.fasta ) + ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) + // // MODULE: To compress PacBio HiFi aligned CRAM files // - CRUMBLE ( ALIGN_HIFI.out.cram, [], true ) + CONVERT_STATS.out.cram + | filter { meta, bam -> meta.datatype == "pacbio" } + | set { ch_pacbio_bams } + CRUMBLE ( ch_pacbio_bams, [], true ) ch_versions = ch_versions.mix ( CRUMBLE.out.versions ) From 6565f9314951221760f422776cfedff421cba2e7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 25 Jan 2024 17:04:46 +0000 Subject: [PATCH 05/16] Rolled out the minimap2 -I setting to all data types --- 
conf/modules.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2390814..aeab190 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -48,7 +48,6 @@ process { ext.prefix = { "${meta.id}.filter" } } - withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp. // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp @@ -56,15 +55,16 @@ process { // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes // NOTE: Use `reference.size()` for now, and switch to `meta2.genome_size` once we update the modules. // ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } } withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { - ext.args = { "-ax map-pb -R ${meta.read_group}" } + ext.args = { "-ax map-pb -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } } withName: '.*:.*:ALIGN_ONT:MINIMAP2_ALIGN' { - ext.args = { "-ax map-ont -R ${meta.read_group}" } + ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } } withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { From b3cb5d8f3c248876ae04e792ceb57f6b1d0eef12 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 25 Jan 2024 17:29:24 +0000 Subject: [PATCH 06/16] Combined two consecutive samtools steps to save some space --- conf/base.config | 4 +- conf/modules.config | 4 -- modules/local/samtools_filtertofastq.nf | 57 +++++++++++++++++++++++++ subworkflows/local/filter_pacbio.nf | 18 +++----- 4 files changed, 65 insertions(+), 18 deletions(-) create mode 
100644 modules/local/samtools_filtertofastq.nf diff --git a/conf/base.config b/conf/base.config index bdb122d..66b612b 100644 --- a/conf/base.config +++ b/conf/base.config @@ -20,7 +20,7 @@ process { memory = { check_max( 50.MB * task.attempt, 'memory' ) } time = { check_max( 30.min * task.attempt, 'time' ) } - withName: 'SAMTOOLS_(CONVERT|FILTER)' { + withName: 'SAMTOOLS_(CONVERT)' { time = { check_max( 1.hour * task.attempt, 'time' ) } } @@ -29,7 +29,7 @@ process { time = { check_max( 4.hour * task.attempt, 'time' ) } } - withName: 'SAMTOOLS_(COLLATETOFASTA|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' { + withName: 'SAMTOOLS_(COLLATETOFASTA|FILTERTOFASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' { time = { check_max( 8.hour * task.attempt, 'time' ) } } diff --git a/conf/modules.config b/conf/modules.config index aeab190..3a93825 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -44,10 +44,6 @@ process { ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index" } - withName: SAMTOOLS_FILTER { - ext.prefix = { "${meta.id}.filter" } - } - // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp. // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp diff --git a/modules/local/samtools_filtertofastq.nf b/modules/local/samtools_filtertofastq.nf new file mode 100644 index 0000000..fbee087 --- /dev/null +++ b/modules/local/samtools_filtertofastq.nf @@ -0,0 +1,57 @@ +process SAMTOOLS_FILTERTOFASTQ { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(index) + path qname + + output: + tuple val(meta), path("*.fastq.gz") , emit: fastq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools view \\ + --threads $task.cpus \\ + --qname-file ${qname} \\ + --unoutput - \\ + $args \\ + -o /dev/null \\ + $input \\ + | \\ + samtools fastq \\ + $args2 \\ + --threads $task.cpus \\ + -0 ${prefix}.fastq.gz \\ + - \\ + > /dev/null + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo | gzip > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index 818534e..759880c 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -7,8 +7,7 @@ include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtoo include { SAMTOOLS_COLLATETOFASTA } from '../../modules/local/samtools_collatetofasta' include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main' include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' -include { SAMTOOLS_VIEW as SAMTOOLS_FILTER } from '../../modules/nf-core/samtools/view/main' -include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq' workflow FILTER_PACBIO { @@ -45,7 +44,7 @@ workflow 
FILTER_PACBIO { ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() ) - // Create filtered BAM file + // Filter the BAM file and convert to FASTQ SAMTOOLS_CONVERT.out.bam | join ( SAMTOOLS_CONVERT.out.csi ) | join ( PACBIO_FILTER.out.list ) @@ -59,16 +58,11 @@ workflow FILTER_PACBIO { | map { meta, bam, csi, list -> list } | set { ch_lists } - SAMTOOLS_FILTER ( ch_reads, [ [], [] ], ch_lists ) - ch_versions = ch_versions.mix ( SAMTOOLS_FILTER.out.versions.first() ) - - - // Convert BAM to FASTQ - SAMTOOLS_FASTQ ( SAMTOOLS_FILTER.out.unoutput, true ) - ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) + SAMTOOLS_FILTERTOFASTQ ( ch_reads, ch_lists ) + ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() ) emit: - fastq = SAMTOOLS_FASTQ.out.other // channel: [ meta, /path/to/fastq ] - versions = ch_versions // channel: [ versions.yml ] + fastq = SAMTOOLS_FILTERTOFASTQ.out.fastq // channel: [ meta, /path/to/fastq ] + versions = ch_versions // channel: [ versions.yml ] } From 3d7a70080727f3e7adb2ce42a7b76a40ff6bea31 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 25 Jan 2024 18:02:11 +0000 Subject: [PATCH 07/16] Sort the BAM file directly with bwamem2 to save some disk space --- conf/base.config | 15 +++----- conf/modules.config | 4 --- modules.json | 5 --- modules/nf-core/samtools/sort/main.nf | 49 -------------------------- modules/nf-core/samtools/sort/meta.yml | 48 ------------------------- subworkflows/local/align_ont.nf | 1 - subworkflows/local/align_short.nf | 4 +-- subworkflows/local/markdup_stats.nf | 8 +---- 8 files changed, 7 insertions(+), 127 deletions(-) delete mode 100644 modules/nf-core/samtools/sort/main.nf delete mode 100644 modules/nf-core/samtools/sort/meta.yml diff --git a/conf/base.config b/conf/base.config index 66b612b..aa296e0 100644 --- a/conf/base.config +++ b/conf/base.config @@ -29,7 +29,7 @@ process { time = { check_max( 4.hour * task.attempt, 'time' ) } } - withName: 
'SAMTOOLS_(COLLATETOFASTA|FILTERTOFASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' { + withName: 'SAMTOOLS_(COLLATETOFASTA|FILTERTOFASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|VIEW)' { time = { check_max( 8.hour * task.attempt, 'time' ) } } @@ -52,13 +52,6 @@ process { time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } } - withName: SAMTOOLS_SORT { - cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } - // Memory increases by 768M for each thread - memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) } - time = { check_max( 8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } - } - withName: BLAST_BLASTN { time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } @@ -77,9 +70,9 @@ process { // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size // Runtime is considered proportional to the number of reads and inversely to number of threads time = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) } - // Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM - // Memory usage of SAMTOOLS_VIEW is negligible. - memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * task.attempt * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } + // Base RAM usage is about 6 times the genome size. 
+ // Each thread takes an additional 800 MB RAM for bwa-mem2 and 800 MB for samtools sort + memory = { check_max( 8.GB + 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 1600.MB * task.attempt * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } } withName: MINIMAP2_ALIGN { diff --git a/conf/modules.config b/conf/modules.config index 3a93825..6259289 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -23,10 +23,6 @@ process { ext.args = { "-R ${meta.read_group}" } } - withName: SAMTOOLS_SORT { - ext.prefix = { "${meta.id}.sort" } - } - withName: SAMTOOLS_MERGE { ext.args = { "-c -p" } ext.prefix = { "${meta.id}.merge" } diff --git a/modules.json b/modules.json index 66a383a..b61ea34 100644 --- a/modules.json +++ b/modules.json @@ -66,11 +66,6 @@ "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", "installed_by": ["modules"] }, - "samtools/sort": { - "branch": "master", - "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", - "installed_by": ["modules"] - }, "samtools/stats": { "branch": "master", "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf deleted file mode 100644 index 2b7753f..0000000 --- a/modules/nf-core/samtools/sort/main.nf +++ /dev/null @@ -1,49 +0,0 @@ -process SAMTOOLS_SORT { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(bam) - - output: - tuple val(meta), path("*.bam"), emit: bam - tuple val(meta), path("*.csi"), emit: csi, optional: true - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - samtools sort \\ - $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - -T $prefix \\ - $bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml deleted file mode 100644 index 0732843..0000000 --- a/modules/nf-core/samtools/sort/meta.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: samtools_sort -description: Sort SAM/BAM/CRAM file -keywords: - - sort - - bam - - sam - - cram -tools: - - samtools: - description: | - SAMtools is a set of utilities for interacting with and post-processing - short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. - These files are generated as output by short read aligners like BWA. - homepage: http://www.htslib.org/ - documentation: http://www.htslib.org/doc/samtools.html - doi: 10.1093/bioinformatics/btp352 - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: Sorted BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - csi: - type: file - description: BAM index file (optional) - pattern: "*.csi" -authors: - - "@drpatelh" - - "@ewels" diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index cf1feeb..91ed3c7 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -4,7 +4,6 @@ include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' workflow ALIGN_ONT { diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index 73a8e2e..d834f9a 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -23,8 +23,8 @@ workflow ALIGN_SHORT { ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) - // Align Fastq to Genome - BWAMEM2_MEM ( SAMTOOLS_FASTQ.out.fastq, index, [] ) + // Align Fastq to Genome and output sorted BAM + BWAMEM2_MEM ( SAMTOOLS_FASTQ.out.fastq, index, true ) ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() ) diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf index 7d30257..8193ca8 100644 --- a/subworkflows/local/markdup_stats.nf +++ b/subworkflows/local/markdup_stats.nf @@ -4,7 +4,6 @@ // include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_SORMADUP } from 
'../../modules/local/samtools_sormadup' @@ -18,13 +17,8 @@ workflow MARKDUP_STATS { ch_versions = Channel.empty() - // Sort BAM file - SAMTOOLS_SORT ( aln ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - // Collect all BWAMEM2 output by sample name - SAMTOOLS_SORT.out.bam + aln | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple( by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } From 0e21eafd4fe8df1107072b57975aff7d459d550e Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 25 Jan 2024 18:06:45 +0000 Subject: [PATCH 08/16] Merged the markdup_stats sub-workflow into align_illumina --- subworkflows/local/align_short.nf | 44 ++++++++++++++++++---- subworkflows/local/markdup_stats.nf | 57 ----------------------------- 2 files changed, 37 insertions(+), 64 deletions(-) delete mode 100644 subworkflows/local/markdup_stats.nf diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index d834f9a..33c27b6 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -2,9 +2,10 @@ // Align short read (HiC and Illumina) data against the genome // -include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' -include { BWAMEM2_MEM } from '../../modules/nf-core/bwamem2/mem/main' -include { MARKDUP_STATS } from '../../subworkflows/local/markdup_stats' +include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' +include { BWAMEM2_MEM } from '../../modules/nf-core/bwamem2/mem/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' workflow ALIGN_SHORT { @@ -28,12 +29,41 @@ workflow ALIGN_SHORT { ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() ) - // Merge, markdup, convert, and stats - MARKDUP_STATS ( BWAMEM2_MEM.out.bam, 
fasta ) - ch_versions = ch_versions.mix ( MARKDUP_STATS.out.versions ) + // Collect all BWAMEM2 output by sample name + BWAMEM2_MEM.out.bam + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } + | groupTuple( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } + | set { ch_bams } + + + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) + | set { ch_bam } + + + // Mark duplicates + SAMTOOLS_SORMADUP ( ch_bam, fasta ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) + + + // Convert merged BAM to CRAM and calculate indices and statistics + SAMTOOLS_SORMADUP.out.bam + | map { meta, bam -> [ meta, bam, [] ] } + | set { ch_stat } emit: - bam = MARKDUP_STATS.out.bam // channel: [ val(meta), /path/to/bam ] + bam = ch_stat // channel: [ val(meta), /path/to/bam ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf deleted file mode 100644 index 8193ca8..0000000 --- a/subworkflows/local/markdup_stats.nf +++ /dev/null @@ -1,57 +0,0 @@ -// -// Merge and Markdup all alignments at specimen level -// Convert to CRAM and calculate statistics -// - -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' - - -workflow MARKDUP_STATS { - take: - aln // channel: [ val(meta), /path/to/bam ] - fasta // channel: [ val(meta), /path/to/fasta ] - - - main: - ch_versions = Channel.empty() - - - // Collect all BWAMEM2 output by sample name - aln - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 
'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple( by: [0] ) - | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } - | branch { - meta, bams -> - single_bam: bams.size() == 1 - multi_bams: true - } - | set { ch_bams } - - - // Merge, but only if there is more than 1 file - SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - SAMTOOLS_MERGE.out.bam - | mix ( ch_bams.single_bam ) - | set { ch_bam } - - - // Mark duplicates - SAMTOOLS_SORMADUP ( ch_bam, fasta ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) - - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORMADUP.out.bam - | map { meta, bam -> [ meta, bam, [] ] } - | set { ch_stat } - - - emit: - bam = ch_stat // channel: [ val(meta), /path/to/bam ] - versions = ch_versions // channel: [ versions.yml ] -} From 3d71db723a6100ee50b9e137b6e4901a01e1187f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 2 Feb 2024 16:33:27 +0000 Subject: [PATCH 09/16] Updated the minimap2/align module to the latest nf-core version Now accepts a meta2 for the reference genome --- conf/base.config | 2 +- conf/modules.config | 8 +- modules.json | 2 +- .../nf-core/minimap2/align/environment.yml | 9 ++ modules/nf-core/minimap2/align/main.nf | 13 +- modules/nf-core/minimap2/align/meta.yml | 10 ++ .../nf-core/minimap2/align/tests/main.nf.test | 145 ++++++++++++++++++ .../minimap2/align/tests/main.nf.test.snap | 38 +++++ modules/nf-core/minimap2/align/tests/tags.yml | 2 + subworkflows/local/align_ont.nf | 9 +- subworkflows/local/align_pacbio.nf | 9 +- 11 files changed, 220 insertions(+), 27 deletions(-) create mode 100644 modules/nf-core/minimap2/align/environment.yml create mode 100644 modules/nf-core/minimap2/align/tests/main.nf.test create mode 100644 modules/nf-core/minimap2/align/tests/main.nf.test.snap create mode 100644 
modules/nf-core/minimap2/align/tests/tags.yml diff --git a/conf/base.config b/conf/base.config index aa296e0..ca61f16 100644 --- a/conf/base.config +++ b/conf/base.config @@ -77,7 +77,7 @@ process { withName: MINIMAP2_ALIGN { cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } - memory = { check_max( 14.GB * Math.ceil( Math.pow(reference.size() / 1000000000, 0.6)) * task.attempt, 'memory' ) } + memory = { check_max( 800.MB * log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) + 14.GB * Math.ceil( Math.pow(meta2.genome_size / 1000000000, 0.6)) * task.attempt, 'memory' ) } time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } } diff --git a/conf/modules.config b/conf/modules.config index 6259289..d95ed06 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -45,18 +45,16 @@ process { // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values. // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes - // NOTE: Use `reference.size()` for now, and switch to `meta2.genome_size` once we update the modules. 
- // ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' } withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { - ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { - ext.args = { "-ax map-pb -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } + ext.args = { "-ax map-pb -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } withName: '.*:.*:ALIGN_ONT:MINIMAP2_ALIGN' { - ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } + ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { diff --git a/modules.json b/modules.json index b61ea34..9321f7d 100644 --- a/modules.json +++ b/modules.json @@ -38,7 +38,7 @@ }, "minimap2/align": { "branch": "master", - "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "git_sha": "efbf86bb487f288ac30660282709d9620dd6048e", "installed_by": ["modules"] }, "samtools/faidx": { diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml new file mode 100644 index 0000000..cf6e775 --- /dev/null +++ b/modules/nf-core/minimap2/align/environment.yml @@ -0,0 +1,9 @@ +name: minimap2_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::minimap2=2.24 + - bioconda::samtools=1.18 + - bioconda::htslib=1.18 diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf index 4da47c1..07a3215 100644 --- a/modules/nf-core/minimap2/align/main.nf +++ b/modules/nf-core/minimap2/align/main.nf @@ -3,14 +3,14 @@ process MINIMAP2_ALIGN { label 'process_medium' // Note: the versions here need to match the versions used in the mulled 
container below and minimap2/index - conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : - 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' }" input: tuple val(meta), path(reads) - path reference + tuple val(meta2), path(reference) val bam_format val cigar_paf_format val cigar_bam @@ -24,9 +24,10 @@ process MINIMAP2_ALIGN { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" + def bam_output = bam_format ? "-a | samtools sort -@ ${task.cpus} -o ${prefix}.bam ${args2}" : "-o ${prefix}.paf" def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' """ diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml index 991b39a..408522d 100644 --- a/modules/nf-core/minimap2/align/meta.yml +++ b/modules/nf-core/minimap2/align/meta.yml @@ -25,6 +25,11 @@ input: description: | List of input FASTA or FASTQ files of size 1 and 2 for single-end and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test_ref'] - reference: type: file description: | @@ -63,3 +68,8 @@ authors: - "@sofstam" - "@sateeshperi" - "@jfy133" +maintainers: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test b/modules/nf-core/minimap2/align/tests/main.nf.test new file mode 100644 index 0000000..b634468 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test @@ -0,0 +1,145 @@ +nextflow_process { + + name "Test Process MINIMAP2_ALIGN" + script "../main.nf" + process "MINIMAP2_ALIGN" + + tag "modules" + tag "modules_nfcore" + tag "minimap2" + tag "minimap2/align" + + test("sarscov2 - fastq, fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, [], 
true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test.snap b/modules/nf-core/minimap2/align/tests/main.nf.test.snap new file mode 100644 index 0000000..a39a169 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test.snap @@ -0,0 +1,38 @@ +{ + "sarscov2 - fastq, fasta, true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:06.01315354" + }, + "sarscov2 - fastq, fasta, true, false, false - stub": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:24.487175659" + }, + "sarscov2 - [fastq1, fastq2], fasta, true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": 
"2023-12-04T12:07:12.50816279" + }, + "sarscov2 - fastq, [], true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:18.414974788" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/tests/tags.yml b/modules/nf-core/minimap2/align/tests/tags.yml new file mode 100644 index 0000000..39dba37 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/tags.yml @@ -0,0 +1,2 @@ +minimap2/align: + - "modules/nf-core/minimap2/align/**" diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index 91ed3c7..f1013d4 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -16,13 +16,8 @@ workflow ALIGN_ONT { ch_versions = Channel.empty() - // Align Fastq to Genome - fasta - | map { meta, file -> file } - | set { ch_fasta } - - // Align with minimap2. bam_format is set to true, making the output a *sorted* BAM - MINIMAP2_ALIGN ( reads, ch_fasta, true, false, false ) + // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM + MINIMAP2_ALIGN ( reads, fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index a29e827..07855a7 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -23,13 +23,8 @@ workflow ALIGN_PACBIO { ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) - // Align Fastq to Genome - fasta - | map { meta, file -> file } - | set { ch_fasta } - - // Align with minimap2. bam_format is set to true, making the output a *sorted* BAM - MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, ch_fasta, true, false, false ) + // Align Fastq to Genome with minimap2. 
bam_format is set to true, making the output a *sorted* BAM + MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) From 7a9671c7654ae7ecbfb88616e7951afdaa7c88a7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 6 Feb 2024 18:31:25 +0000 Subject: [PATCH 10/16] Updated the crumble module to keep it closer to the original --- modules/nf-core/crumble/crumble.diff | 8 +------- modules/nf-core/crumble/main.nf | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/modules/nf-core/crumble/crumble.diff b/modules/nf-core/crumble/crumble.diff index 2c4cb1e..e5b45f9 100644 --- a/modules/nf-core/crumble/crumble.diff +++ b/modules/nf-core/crumble/crumble.diff @@ -1,13 +1,7 @@ Changes in module 'nf-core/crumble' --- modules/nf-core/crumble/main.nf +++ modules/nf-core/crumble/main.nf -@@ -30,11 +30,14 @@ - args.contains("-O cram") ? "cram" : - "sam" - def bedin = keepbed ? "-R ${keepbed}" : "" -- def bedout = bedout ? "-b ${prefix}.out.bed" : "" -+ def bedout = bedout ? "-b ${prefix}.suspicious_regions.bed" : "" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" +@@ -35,6 +35,9 @@ def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ diff --git a/modules/nf-core/crumble/main.nf b/modules/nf-core/crumble/main.nf index 44c0c59..17260cb 100644 --- a/modules/nf-core/crumble/main.nf +++ b/modules/nf-core/crumble/main.nf @@ -30,7 +30,7 @@ process CRUMBLE { args.contains("-O cram") ? "cram" : "sam" def bedin = keepbed ? "-R ${keepbed}" : "" - def bedout = bedout ? "-b ${prefix}.suspicious_regions.bed" : "" + def bedout = bedout ? "-b ${prefix}.out.bed" : "" if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. From cfb7977af7b1750e26c8c87b012f3cb85c37828b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 6 Feb 2024 18:37:38 +0000 Subject: [PATCH 11/16] We don't need the suspicious regions --- workflows/readmapping.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index e579d3e..af3ff3d 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -142,7 +142,7 @@ workflow READMAPPING { CONVERT_STATS.out.cram | filter { meta, bam -> meta.datatype == "pacbio" } | set { ch_pacbio_bams } - CRUMBLE ( ch_pacbio_bams, [], true ) + CRUMBLE ( ch_pacbio_bams, [], [] ) ch_versions = ch_versions.mix ( CRUMBLE.out.versions ) From 20536ea60c0217b18644c18a41efec9b9c830ce3 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 6 Feb 2024 18:50:47 +0000 Subject: [PATCH 12/16] Also compress the Illumina files with crumble, and don't compress the pre-crumble BAMs --- conf/modules.config | 9 ++------- subworkflows/local/convert_stats.nf | 22 ++++++++++++++++++++-- workflows/readmapping.nf | 10 ---------- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d95ed06..c9e3628 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -76,12 +76,7 @@ process { withName: CRUMBLE { ext.prefix = { "${input.baseName}.crumble" } - ext.args = '-y pbccs -O cram' - publishDir = [ - path: { "${params.outdir}/read_mapping/pacbio" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + ext.args = { (meta.datatype == "pacbio" ? 
"-y pbccs " : "") + "-O bam" } } withName: SAMPLESHEET_CHECK { @@ -92,7 +87,7 @@ process { ] } - withName: '.*:CONVERT_STATS:.*' { + withName: '.*:CONVERT_STATS:SAMTOOLS_.*' { publishDir = [ path: { "${params.outdir}/read_mapping/${meta.datatype}" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/convert_stats.nf b/subworkflows/local/convert_stats.nf index 2ea16b9..b89928a 100644 --- a/subworkflows/local/convert_stats.nf +++ b/subworkflows/local/convert_stats.nf @@ -2,6 +2,7 @@ // Convert BAM to CRAM, create index and calculate statistics // +include { CRUMBLE } from '../../modules/nf-core/crumble/main' include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' @@ -17,11 +18,28 @@ workflow CONVERT_STATS { main: ch_versions = Channel.empty() + // Compress the quality scores of Illumina and PacBio CCS alignments + bam + | branch { + meta, bam, bai -> + run_crumble : meta.datatype == "hic" || meta.datatype == "illumina" || meta.datatype == "pacbio" + [meta, bam] + no_crumble: true + } + | set { ch_bams } + + CRUMBLE ( ch_bams.run_crumble, [], [] ) + ch_versions = ch_versions.mix ( CRUMBLE.out.versions ) + // Convert BAM to CRAM - SAMTOOLS_VIEW ( bam, fasta, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) + CRUMBLE.out.bam + | map { meta, bam -> [meta, bam, []] } + | mix ( ch_bams.no_crumble ) + | set { ch_bams_for_conversion } + SAMTOOLS_VIEW ( ch_bams_for_conversion, fasta, [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) // Combine CRAM and CRAI into one channel SAMTOOLS_VIEW.out.cram diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index af3ff3d..18ebd36 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -46,7 +46,6 @@ include { CONVERT_STATS } from '../subworkflows/local/convert_st // 
include { UNTAR } from '../modules/nf-core/untar/main' -include { CRUMBLE } from '../modules/nf-core/crumble/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -136,15 +135,6 @@ workflow READMAPPING { CONVERT_STATS ( ch_aligned_bams, PREPARE_GENOME.out.fasta ) ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) - // - // MODULE: To compress PacBio HiFi aligned CRAM files - // - CONVERT_STATS.out.cram - | filter { meta, bam -> meta.datatype == "pacbio" } - | set { ch_pacbio_bams } - CRUMBLE ( ch_pacbio_bams, [], [] ) - ch_versions = ch_versions.mix ( CRUMBLE.out.versions ) - // // MODULE: Combine different versions.yml From 7cbb4161de0b95bd5edf271c5fadf6a7805a2119 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 6 Feb 2024 19:14:43 +0000 Subject: [PATCH 13/16] Not needed because crumble now outputs BAM --- modules.json | 3 +-- modules/nf-core/crumble/crumble.diff | 15 --------------- modules/nf-core/crumble/main.nf | 3 --- 3 files changed, 1 insertion(+), 20 deletions(-) delete mode 100644 modules/nf-core/crumble/crumble.diff diff --git a/modules.json b/modules.json index 9321f7d..5e4c2e6 100644 --- a/modules.json +++ b/modules.json @@ -23,8 +23,7 @@ "crumble": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"], - "patch": "modules/nf-core/crumble/crumble.diff" + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", diff --git a/modules/nf-core/crumble/crumble.diff b/modules/nf-core/crumble/crumble.diff deleted file mode 100644 index e5b45f9..0000000 --- a/modules/nf-core/crumble/crumble.diff +++ /dev/null @@ -1,15 +0,0 @@ -Changes in module 'nf-core/crumble' ---- modules/nf-core/crumble/main.nf -+++ modules/nf-core/crumble/main.nf -@@ -35,6 +35,9 @@ - - def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
- """ -+ # Need to fake REF_PATH to force crumble to use the Fasta file defined in -+ # the UR field of the @SQ headers. (bug reported to the samtools team). -+ env REF_PATH=/missing \\ - crumble \\ - $args \\ - $bedin \\ - -************************************************************ diff --git a/modules/nf-core/crumble/main.nf b/modules/nf-core/crumble/main.nf index 17260cb..2699257 100644 --- a/modules/nf-core/crumble/main.nf +++ b/modules/nf-core/crumble/main.nf @@ -35,9 +35,6 @@ process CRUMBLE { def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ - # Need to fake REF_PATH to force crumble to use the Fasta file defined in - # the UR field of the @SQ headers. (bug reported to the samtools team). - env REF_PATH=/missing \\ crumble \\ $args \\ $bedin \\ From 7ba557caa81d3ee5bd3c3e2f9729149c64d193cb Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 5 Jun 2024 18:35:55 +0000 Subject: [PATCH 14/16] Memory settings for ALIGN_ONT:MINIMAP2_ALIGN --- conf/base.config | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index ca61f16..ca753f3 100644 --- a/conf/base.config +++ b/conf/base.config @@ -75,12 +75,19 @@ process { memory = { check_max( 8.GB + 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 1600.MB * task.attempt * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } } - withName: MINIMAP2_ALIGN { + withName: '.*:ALIGN_HIFI:MINIMAP2_ALIGN' { cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } memory = { check_max( 800.MB * log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) + 14.GB * Math.ceil( Math.pow(meta2.genome_size / 1000000000, 0.6)) * task.attempt, 'memory' ) } time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } } + // Extrapolated from the HIFI settings on the basis of 1 ONT alignment. 
CLR assumed to behave the same way as ONT + withName: '.*:ALIGN_(CLR|ONT):MINIMAP2_ALIGN' { + cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } + memory = { check_max( 800.MB * log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) + 30.GB * Math.ceil( Math.pow(meta2.genome_size / 1000000000, 0.6)) * task.attempt, 'memory' ) } + time = { check_max( 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + } + withName: CRUMBLE { // No correlation between memory usage and the number of reads or the genome size. // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB. From 92b4744e03349d649c6510e0942a8c10859b106d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 6 Jun 2024 14:35:24 +0000 Subject: [PATCH 15/16] Bumped the version and updated the changelog --- CHANGELOG.md | 19 ++++++++++++++++++- nextflow.config | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8eb80be..8150062 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,24 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[1.2.2](https://github.com/sanger-tol/readmapping/releases/tag/1.2.2)] - Norwegian Ridgeback (patch 2) -[2024-05-23] +## [[1.3.0](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye - [2024-06-XX] + +### Enhancements & fixes + +- Combined steps to improve the efficiency of the pipeline, especially on large genomes +- "crumble" is now run on _every_ data type, not just PacBio + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. 
However, the overall software dependency changes compared to the last release have been listed below for reference. + +| Dependency | Old version | New version | +| ---------- | ------------- | ------------- | +| `samtools` | 1.14 and 1.17 | 1.17 and 1.18 | + +> **NB:** Dependency has been **updated** if both old and new version information is present.<br>
**NB:** Dependency has been **added** if just the new version information is present.<br>
**NB:** Dependency has been **removed** if version information isn't present. + +## [[1.2.2](https://github.com/sanger-tol/readmapping/releases/tag/1.2.2)] - Norwegian Ridgeback (patch 2) - [2024-05-23] ### Enhancements & fixes diff --git a/nextflow.config b/nextflow.config index cd9aae7..d09f872 100644 --- a/nextflow.config +++ b/nextflow.config @@ -183,7 +183,7 @@ manifest { description = 'Pipeline to map reads generated using different sequencing technologies against a genome assembly.' mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.2.2' + version = '1.3.0' doi = '10.5281/zenodo.6563577' } From 82996949a154e0315e14efdd8ee5c901058fa94a Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 6 Jun 2024 15:51:07 +0100 Subject: [PATCH 16/16] The patch was missing --- modules.json | 3 ++- .../samtools/merge/samtools-merge.diff | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/samtools/merge/samtools-merge.diff diff --git a/modules.json b/modules.json index 5e4c2e6..f7da822 100644 --- a/modules.json +++ b/modules.json @@ -63,7 +63,8 @@ "samtools/merge": { "branch": "master", "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/samtools/merge/samtools-merge.diff" }, "samtools/stats": { "branch": "master", diff --git a/modules/nf-core/samtools/merge/samtools-merge.diff b/modules/nf-core/samtools/merge/samtools-merge.diff new file mode 100644 index 0000000..cca0b3c --- /dev/null +++ b/modules/nf-core/samtools/merge/samtools-merge.diff @@ -0,0 +1,26 @@ +Changes in module 'nf-core/samtools/merge' +--- modules/nf-core/samtools/merge/main.nf ++++ modules/nf-core/samtools/merge/main.nf +@@ -26,6 +26,11 @@ + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def file_type = input_files instanceof List ? 
input_files[0].getExtension() : input_files.getExtension() ++ if (input_files instanceof List) { ++ sorted_input_files = input_files.toSorted({it.name}).join(' ') ++ } else { ++ sorted_input_files = input_files ++ } + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ +@@ -34,7 +39,7 @@ + $args \\ + ${reference} \\ + ${prefix}.${file_type} \\ +- $input_files ++ $sorted_input_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************