add input type fastq.gz and fq.gz for Illumina reads

sanger-tol · Jul 9, 2024 · 8b74d6c · 8b74d6c
1 parent f9be6aa
commit 8b74d6c
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 3 deletions.
diff --git a/docs/usage.md b/docs/usage.md
@@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2
 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_).                                                 |
 | `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, or `ont`.                                                                                                                                                        |
-| `datafile` | Full path to read data file. Must be `bam` or `cram` for `hic` and `illumina`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                                                                                   |
+| `datafile` | Full path to read data file. Must be `bam` or `cram` for `hic`. Must be `bam`, `cram`, `fastq.gz` or `fq.gz` for `illumina`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                                                                                   |
 | `library`  | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

diff --git a/subworkflows/local/align_illumina_fastq.nf b/subworkflows/local/align_illumina_fastq.nf
@@ -0,0 +1,63 @@
+//
+// Align Illumina FASTQ data against the genome
+//
+
+include { BWAMEM2_MEM       } from '../../modules/nf-core/bwamem2/mem/main'
+include { SAMTOOLS_MERGE    } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup'
+
+
+workflow ALIGN_ILLUMINA_FASTQ {
+    take:
+    fasta    // channel: [ val(meta), /path/to/fasta ]
+    index    // channel: [ val(meta), /path/to/bwamem2/ ]
+    reads    // channel: [ val(meta), /path/to/fastq.gz ]
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    // Align FASTQ reads to the genome and output sorted BAM
+    BWAMEM2_MEM ( reads, index, true )
+    ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() )
+
+    // Collect all BWAMEM2 output by sample name: the grouping id is meta.id with its
+    // last `_`-separated token removed, and read counts are summed per group
+    BWAMEM2_MEM.out.bam
+    | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
+    | groupTuple( by: [0] )
+    | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
+    | branch {
+        meta, bams ->
+            single_bam: bams.size() == 1
+            multi_bams: true
+    }
+    | set { ch_bams }
+
+
+    // Merge, but only if there is more than 1 file
+    SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+    // Recombine merged BAMs with the samples that only had a single BAM
+    SAMTOOLS_MERGE.out.bam
+    | mix ( ch_bams.single_bam )
+    | set { ch_bam }
+
+
+    // Mark duplicates
+    SAMTOOLS_SORMADUP ( ch_bam, fasta )
+    ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions.first() )
+
+    // Append an empty third element to each tuple; no CRAM conversion or statistics
+    // happen here. NOTE(review): presumably an index placeholder -- confirm with caller
+    SAMTOOLS_SORMADUP.out.bam
+    | map { meta, bam -> [ meta, bam, [] ] }
+    | set { ch_stat }
+
+
+    emit:
+    bam      = ch_stat                       // channel: [ val(meta), /path/to/bam, [] ]
+    versions = ch_versions                   // channel: [ versions.yml ]
+}
diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf
@@ -33,6 +33,7 @@ include { ALIGN_PACBIO as ALIGN_HIFI    } from '../subworkflows/local/align_pacb
 include { ALIGN_PACBIO as ALIGN_CLR     } from '../subworkflows/local/align_pacbio'
 include { ALIGN_ONT                     } from '../subworkflows/local/align_ont'
 include { CONVERT_STATS                 } from '../subworkflows/local/convert_stats'
+include { ALIGN_ILLUMINA_FASTQ          } from '../subworkflows/local/align_illumina_fastq'
 
 
 /*
@@ -76,6 +77,13 @@ workflow READMAPPING {
 
     ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )
 
+    ch_reads.illumina                        // split Illumina inputs by data file type
+    | branch {
+        meta, reads ->
+            fastq : reads.findAll { it.getName().toLowerCase() ==~ /.+\.(fastq|fq)\.gz/ }   // name ends in .fastq.gz / .fq.gz
+            cram : true                      // fallback: everything else (BAM or CRAM), so BAM inputs are not dropped
+    }
+    | set { ch_illumina }
 
     //
     // SUBWORKFLOW: Uncompress and prepare reference genome files
@@ -112,10 +120,13 @@ workflow READMAPPING {
     //
     ALIGN_HIC ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.hic )
     ch_versions = ch_versions.mix ( ALIGN_HIC.out.versions )
-
-    ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina )
+    
+    ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_illumina.cram )
     ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions )
 
+    ALIGN_ILLUMINA_FASTQ ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_illumina.fastq )
+    ch_versions = ch_versions.mix ( ALIGN_ILLUMINA_FASTQ.out.versions )
+
     ALIGN_HIFI ( PREPARE_GENOME.out.fasta, ch_reads.pacbio, ch_vector_db )
     ch_versions = ch_versions.mix ( ALIGN_HIFI.out.versions )