Merge pull request sanger-tol#9 from reichan1998/cram_handling

Merge cram_handling and 1.3.1
reichan1998 · Oct 1, 2024 · f72917c · f72917c
2 parents c39bb79 + b089fa8
commit f72917c
Show file tree

Hide file tree

Showing 6 changed files with 132 additions and 24 deletions.
diff --git a/conf/base.config b/conf/base.config
@@ -63,6 +63,12 @@ process {
         time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
     }
 
+    withName: SAMTOOLS_ADDREPLACERG {
+        cpus   = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
+        memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) }
+        time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
+    }
+
     withName: BLAST_BLASTN {
         time   = { check_max(          2.hour  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time'   ) }
         memory = { check_max( 100.MB + 20.MB   * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
@@ -114,6 +120,11 @@ process {
         memory  = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) }
     }
 
+    withName: GENERATE_CRAM_CSV {
+        cpus    = { check_max( 4    * task.attempt, 'cpus'    ) }
+        memory  = { check_max( 16.GB * task.attempt, 'memory' ) }
+    }
+
     withName: CRUMBLE {
         // No correlation between memory usage and the number of reads or the genome size.
         // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.

diff --git a/conf/modules.config b/conf/modules.config
@@ -20,14 +20,6 @@ process {
         ext.args = 'include=f'
     }
 
-    withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
-        ext.args = { "-5SPCp -R ${meta.read_group}" }
-    }
-
-    withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
-        ext.args = { "-p -R ${meta.read_group}" }
-    }
-
     withName: SAMTOOLS_MERGE {
         beforeScript = { "export REF_PATH=spoof"}
         ext.args = { "-c -p" }
@@ -119,6 +111,30 @@ process {
         ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
     }
 
+    withName: ".*:ALIGN_HIFI:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "-F 0x200 -nt" }
+        ext.args2       = { "-ax map-hifi --cs=short -I" + Math.ceil(meta.genome_size/1e9) + 'G' }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: ".*:ALIGN_CLR:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "-F 0x200 -nt" }
+        ext.args2       = { "-ax map-pb -I" + Math.ceil(meta.genome_size/1e9) + 'G' }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: ".*:ALIGN_ONT:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "-F 0x200 -nt" }
+        ext.args2       = { "-ax map-ont -I" + Math.ceil(meta.genome_size/1e9) + 'G' }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
     withName: '.*:CONVERT_STATS:SAMTOOLS_CRAM' {
         beforeScript = { "export REF_PATH=spoof"}
         ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" }

diff --git a/seq_cache_populate.pl b/seq_cache_populate.pl
diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf
@@ -2,7 +2,11 @@
 // Align Nanopore read files against the genome
 //
 
-include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main'
+include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg'
+include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main'
+include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv'
+include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce'
+include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup'
 include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
 
 
@@ -14,17 +18,54 @@ workflow ALIGN_ONT {
 
     main:
     ch_versions = Channel.empty()
+    ch_merged_bam   = Channel.empty()
 
+    // Convert FASTQ to CRAM
+    CONVERT_CRAM ( reads, fasta )
+    ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions )
 
-    // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM
-    MINIMAP2_ALIGN ( reads, fasta, true, "bai", false, false )
-    ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() )
+    SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam )
+    ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions )
 
+    SAMTOOLS_ADDREPLACERG.out.cram
+    | set { ch_reads_cram }
 
-    // Collect all alignment output by sample name
-    MINIMAP2_ALIGN.out.bam
+    // Index the CRAM file
+    SAMTOOLS_INDEX ( ch_reads_cram )
+    ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions )
+
+    ch_reads_cram
+    | join ( SAMTOOLS_INDEX.out.crai )
+    | set { ch_reads_cram_crai }
+
+
+    //
+    // MODULE: generate a CRAM CSV file containing the required parameters for CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT
+    //
+    GENERATE_CRAM_CSV( ch_reads_cram_crai )
+    ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )
+
+    //
+    // SUBWORKFLOW: mapping ONT reads using minimap2
+    //
+    MINIMAP2_MAPREDUCE (
+        fasta,
+        GENERATE_CRAM_CSV.out.csv
+    )
+    ch_versions         = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions )
+    ch_merged_bam           = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam)
+
+
+    ch_merged_bam
+    | combine( ch_reads_cram_crai )
+    | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
+    | set { ch_merged_bam }
+
+
+    // Collect all BAM output by sample name
+    ch_merged_bam
     | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
-    | groupTuple ( by: [0] )
+    | groupTuple( by: [0] )
     | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
     | branch {
         meta, bams ->

diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf
@@ -3,7 +3,11 @@
 //
 
 include { FILTER_PACBIO  } from '../../subworkflows/local/filter_pacbio'
-include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main'
+include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg'
+include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main'
+include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv'
+include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce'
+include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup'
 include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
 
 
@@ -16,21 +20,56 @@ workflow ALIGN_PACBIO {
 
     main:
     ch_versions = Channel.empty()
-
+    ch_merged_bam   = Channel.empty()
 
     // Filter BAM and output as FASTQ
     FILTER_PACBIO ( reads, db )
     ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions )
 
+    // Convert FASTQ to CRAM
+    CONVERT_CRAM ( FILTER_PACBIO.out.fastq, fasta )
+    ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions )
+
+    SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam )
+    ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions )
+
+    SAMTOOLS_ADDREPLACERG.out.cram
+    | set { ch_reads_cram }
+
+    // Index the CRAM file
+    SAMTOOLS_INDEX ( ch_reads_cram )
+    ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions )
+
+    ch_reads_cram
+    | join ( SAMTOOLS_INDEX.out.crai )
+    | set { ch_reads_cram_crai }
+
+
+    //
+    // MODULE: generate a CRAM CSV file containing the required parameters for CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT
+    //
+    GENERATE_CRAM_CSV( ch_reads_cram_crai )
+    ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )
+
+    //
+    // SUBWORKFLOW: mapping pacbio reads using minimap2
+    //
+    MINIMAP2_MAPREDUCE (
+        fasta,
+        GENERATE_CRAM_CSV.out.csv
+    )
+    ch_versions         = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions )
+    ch_merged_bam           = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam)
 
-    // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM
-    MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, "bai", false, false )
-    ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() )
+    ch_merged_bam
+    | combine( ch_reads_cram_crai )
+    | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
+    | set { ch_merged_bam }
 
-    // Collect all alignment output by sample name
-    MINIMAP2_ALIGN.out.bam
+    // Collect all BAM output by sample name
+    ch_merged_bam
     | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
-    | groupTuple ( by: [0] )
+    | groupTuple( by: [0] )
     | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
     | branch {
         meta, bams ->

diff --git a/subworkflows/local/minimap2_mapreduce.nf b/subworkflows/local/minimap2_mapreduce.nf
@@ -38,7 +38,8 @@ workflow MINIMAP2_MAPREDUCE {
         .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path->
             tuple([
                     id: cram_id.id,
-                    chunk_id: cram_id.id + "_" + cram_info[5]
+                    chunk_id: cram_id.id + "_" + cram_info[5],
+                    genome_size: ref_id.genome_size
                     ],
                 file(cram_info[0]),
                 cram_info[1],