From 480430727fcb6b7ce96d9727b59e902a8d866480 Mon Sep 17 00:00:00 2001
From: reichan1998 <baochauduong1311@gmail.com>
Date: Tue, 9 Jul 2024 19:08:20 +0700
Subject: [PATCH] move the branching for FASTQ files inside of align_short.nf

---
 docs/usage.md                              |  2 +-
 subworkflows/local/align_illumina_fastq.nf | 63 ----------------------
 subworkflows/local/align_short.nf          | 28 +++++++---
 workflows/readmapping.nf                   | 13 +----
 4 files changed, 24 insertions(+), 82 deletions(-)
 delete mode 100644 subworkflows/local/align_illumina_fastq.nf

diff --git a/docs/usage.md b/docs/usage.md
index cb0fb20..dc201ef 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2
 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_).                                                 |
 | `datatype` | Type of sequencing data. Must be one of `hic`, `Illumina`, `pacbio`, or `ont`.                                                                                                                                                        |
-| `datafile` | Full path to read data file. Must be `bam` or `cram` for `hic`. Must be `cram` or `fastq.gz` or `fq.gz` for `Illumina`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                                          |
+| `datafile` | Full path to read data file. Must be `bam` or `cram` or `fastq.gz` or `fq.gz` for `Illumina` and `hic`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                                                          |
 | `library`  | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
diff --git a/subworkflows/local/align_illumina_fastq.nf b/subworkflows/local/align_illumina_fastq.nf
deleted file mode 100644
index 0447ea4..0000000
--- a/subworkflows/local/align_illumina_fastq.nf
+++ /dev/null
@@ -1,63 +0,0 @@
-//
-// Align Illumina FASTQ data against the genome
-//
-
-include { BWAMEM2_MEM       } from '../../modules/nf-core/bwamem2/mem/main'
-include { SAMTOOLS_MERGE    } from '../../modules/nf-core/samtools/merge/main'
-include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup'
-
-
-workflow ALIGN_ILLUMINA_FASTQ {
-    take:
-    fasta    // channel: [ val(meta), /path/to/fasta ]
-    index    // channel: [ val(meta), /path/to/bwamem2/ ]
-    reads    // channel: [ val(meta), /path/to/datafile ]
-
-
-    main:
-    ch_versions = Channel.empty()
-
-
-    // Align Fastq to Genome and output sorted BAM
-    BWAMEM2_MEM ( reads, index, true )
-    ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() )
-
-
-    // Collect all BWAMEM2 output by sample name
-    BWAMEM2_MEM.out.bam
-    | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
-    | groupTuple( by: [0] )
-    | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
-    | branch {
-        meta, bams ->
-            single_bam: bams.size() == 1
-            multi_bams: true
-    }
-    | set { ch_bams }
-
-
-    // Merge, but only if there is more than 1 file
-    SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
-    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
-
-
-    SAMTOOLS_MERGE.out.bam
-    | mix ( ch_bams.single_bam )
-    | set { ch_bam }
-
-
-    // Mark duplicates
-    SAMTOOLS_SORMADUP ( ch_bam, fasta )
-    ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )
-
-
-    // Convert merged BAM to CRAM and calculate indices and statistics
-    SAMTOOLS_SORMADUP.out.bam
-    | map { meta, bam -> [ meta, bam, [] ] }
-    | set { ch_stat }
-
-
-    emit:
-    bam      = ch_stat                       // channel: [ val(meta), /path/to/bam ]
-    versions = ch_versions                   // channel: [ versions.yml ]
-}
diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf
index 33c27b6..3237a1e 100644
--- a/subworkflows/local/align_short.nf
+++ b/subworkflows/local/align_short.nf
@@ -18,15 +18,31 @@ workflow ALIGN_SHORT {
     main:
     ch_versions = Channel.empty()
 
+    // Split the input by file type: FASTQ can go straight to the aligner,
+    // anything else (CRAM/BAM) must be converted to FASTQ first
+    reads
+    | branch {
+        meta, reads ->
+            fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ }
+            cram : true
+    }
+    | set { ch_reads }
+
 
-    // Convert from CRAM to FASTQ
-    SAMTOOLS_FASTQ ( reads, false )
-    ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() )
+    // Convert from CRAM to FASTQ; an empty branch simply spawns no tasks, so no `if` is needed
+    SAMTOOLS_FASTQ ( ch_reads.cram, false )
+    ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() )
+
+
+    // Combine the converted reads with the reads supplied directly as FASTQ
+    SAMTOOLS_FASTQ.out.fastq
+    | mix ( ch_reads.fastq )
+    | set { ch_fastq }
 
 
-    // Align Fastq to Genome and output sorted BAM
-    BWAMEM2_MEM ( SAMTOOLS_FASTQ.out.fastq, index, true )
-    ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() )
+    // Align Fastq to Genome and output sorted BAM (single call covers both input types)
+    BWAMEM2_MEM ( ch_fastq, index, true )
+    ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() )
 
 
     // Collect all BWAMEM2 output by sample name
diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf
index 3623f22..2910f52 100644
--- a/workflows/readmapping.nf
+++ b/workflows/readmapping.nf
@@ -33,7 +33,6 @@ include { ALIGN_PACBIO as ALIGN_HIFI    } from '../subworkflows/local/align_pacb
 include { ALIGN_PACBIO as ALIGN_CLR     } from '../subworkflows/local/align_pacbio'
 include { ALIGN_ONT                     } from '../subworkflows/local/align_ont'
 include { CONVERT_STATS                 } from '../subworkflows/local/convert_stats'
-include { ALIGN_ILLUMINA_FASTQ          } from '../subworkflows/local/align_illumina_fastq'
 
 
 /*
@@ -77,13 +76,6 @@ workflow READMAPPING {
 
     ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )
 
-    ch_reads.illumina
-    | branch {
-        meta, reads ->
-            fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ }
-            cram : reads.findAll { it.getName().toLowerCase() =~ /.*cram/ }
-    }
-    | set { ch_illumina }
 
     //
     // SUBWORKFLOW: Uncompress and prepare reference genome files
@@ -121,12 +113,9 @@ workflow READMAPPING {
     ALIGN_HIC ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.hic )
     ch_versions = ch_versions.mix ( ALIGN_HIC.out.versions )
     
-    ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_illumina.cram )
+    ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina )
     ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions )
 
-    ALIGN_ILLUMINA_FASTQ ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_illumina.fastq )
-    ch_versions = ch_versions.mix ( ALIGN_ILLUMINA_FASTQ.out.versions )
-
     ALIGN_HIFI ( PREPARE_GENOME.out.fasta, ch_reads.pacbio, ch_vector_db )
     ch_versions = ch_versions.mix ( ALIGN_HIFI.out.versions )