diff --git a/conf/base.config b/conf/base.config index 6f3dab3..f9d14f7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -63,6 +63,12 @@ process { time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } } + withName: SAMTOOLS_ADDREPLACERG { + cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) } + memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) } + time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } + } + withName: BLAST_BLASTN { time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } @@ -114,6 +120,11 @@ process { memory = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) } } + withName: GENERATE_CRAM_CSV { + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } + } + withName: CRUMBLE { // No correlation between memory usage and the number of reads or the genome size. // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB. 
diff --git a/conf/modules.config b/conf/modules.config index 28116bd..342caca 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,14 +20,6 @@ process { ext.args = 'include=f' } - withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' { - ext.args = { "-5SPCp -R ${meta.read_group}" } - } - - withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' { - ext.args = { "-p -R ${meta.read_group}" } - } - withName: SAMTOOLS_MERGE { beforeScript = { "export REF_PATH=spoof"} ext.args = { "-c -p" } @@ -119,6 +111,30 @@ process { ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } + withName: ".*:ALIGN_HIFI:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + ext.args = "" + ext.args1 = { "-F 0x200 -nt" } + ext.args2 = { "-ax map-hifi --cs=short -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + + withName: ".*:ALIGN_CLR:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + ext.args = "" + ext.args1 = { "-F 0x200 -nt" } + ext.args2 = { "-ax map-pb -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + + withName: ".*:ALIGN_ONT:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + ext.args = "" + ext.args1 = { "-F 0x200 -nt" } + ext.args2 = { "-ax map-ont -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + withName: '.*:CONVERT_STATS:SAMTOOLS_CRAM' { beforeScript = { "export REF_PATH=spoof"} ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" } diff --git a/seq_cache_populate.pl b/seq_cache_populate.pl deleted file mode 100644 index e69de29..0000000 diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index ef1a021..f1e3465 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -2,7 +2,11 @@ // Align Nanopore read files against the genome // -include { MINIMAP2_ALIGN } from 
'../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' +include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce' +include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' @@ -14,17 +18,54 @@ workflow ALIGN_ONT { main: ch_versions = Channel.empty() + ch_merged_bam = Channel.empty() + // Convert FASTQ to CRAM + CONVERT_CRAM ( reads, fasta ) + ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) - // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM - MINIMAP2_ALIGN ( reads, fasta, true, "bai", false, false ) - ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) + SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions ) + SAMTOOLS_ADDREPLACERG.out.cram + | set { ch_reads_cram } - // Collect all alignment output by sample name - MINIMAP2_ALIGN.out.bam + // Index the CRAM file + SAMTOOLS_INDEX ( ch_reads_cram ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) + + ch_reads_cram + | join ( SAMTOOLS_INDEX.out.crai ) + | set { ch_reads_cram_crai } + + + // + // MODULE: generate a CRAM CSV file containing the required parameters for CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT + // + GENERATE_CRAM_CSV( ch_reads_cram_crai ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + + // + // SUBWORKFLOW: mapping Nanopore reads using minimap2 + // + MINIMAP2_MAPREDUCE ( + fasta, + GENERATE_CRAM_CSV.out.csv + ) + ch_versions = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions ) + ch_merged_bam = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam) + + + ch_merged_bam + | 
combine( ch_reads_cram_crai ) + | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } + | set { ch_merged_bam } + + + // Collect all BAM output by sample name + ch_merged_bam | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple ( by: [0] ) + | groupTuple( by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } | branch { meta, bams -> diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index f472a6c..cd42e63 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -3,7 +3,11 @@ // include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' -include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' +include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce' +include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' @@ -16,21 +20,56 @@ workflow ALIGN_PACBIO { main: ch_versions = Channel.empty() - + ch_merged_bam = Channel.empty() // Filter BAM and output as FASTQ FILTER_PACBIO ( reads, db ) ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) + // Convert FASTQ to CRAM + CONVERT_CRAM ( FILTER_PACBIO.out.fastq, fasta ) + ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) + + SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions ) + + SAMTOOLS_ADDREPLACERG.out.cram + | set { ch_reads_cram } + + // Index the CRAM file + SAMTOOLS_INDEX ( ch_reads_cram ) + ch_versions = ch_versions.mix( 
SAMTOOLS_INDEX.out.versions ) + + ch_reads_cram + | join ( SAMTOOLS_INDEX.out.crai ) + | set { ch_reads_cram_crai } + + + // + // MODULE: generate a CRAM CSV file containing the required parameters for CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT + // + GENERATE_CRAM_CSV( ch_reads_cram_crai ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + + // + // SUBWORKFLOW: mapping pacbio reads using minimap2 + // + MINIMAP2_MAPREDUCE ( + fasta, + GENERATE_CRAM_CSV.out.csv + ) + ch_versions = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions ) + ch_merged_bam = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam) - // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM - MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, "bai", false, false ) - ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) + ch_merged_bam + | combine( ch_reads_cram_crai ) + | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } + | set { ch_merged_bam } - // Collect all alignment output by sample name - MINIMAP2_ALIGN.out.bam + // Collect all BAM output by sample name + ch_merged_bam | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple ( by: [0] ) + | groupTuple( by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } | branch { meta, bams -> diff --git a/subworkflows/local/minimap2_mapreduce.nf b/subworkflows/local/minimap2_mapreduce.nf index 35b5aae..7503e02 100644 --- a/subworkflows/local/minimap2_mapreduce.nf +++ b/subworkflows/local/minimap2_mapreduce.nf @@ -38,7 +38,8 @@ workflow MINIMAP2_MAPREDUCE { .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path-> tuple([ id: cram_id.id, - chunk_id: cram_id.id + "_" + cram_info[5] + chunk_id: cram_id.id + "_" + cram_info[5], + genome_size: ref_id.genome_size ], file(cram_info[0]), cram_info[1],