From 480430727fcb6b7ce96d9727b59e902a8d866480 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 9 Jul 2024 19:08:20 +0700 Subject: [PATCH] move the branching for FASTQ files inside of align_short.nf --- docs/usage.md | 2 +- subworkflows/local/align_illumina_fastq.nf | 63 ---------------------- subworkflows/local/align_short.nf | 28 +++++++--- workflows/readmapping.nf | 13 +---- 4 files changed, 24 insertions(+), 82 deletions(-) delete mode 100644 subworkflows/local/align_illumina_fastq.nf diff --git a/docs/usage.md b/docs/usage.md index cb0fb20..dc201ef 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | | `datatype` | Type of sequencing data. Must be one of `hic`, `Illumina`, `pacbio`, or `ont`. | -| `datafile` | Full path to read data file. Must be `bam` or `cram` for `hic`. Must be `cram` or `fastq.gz` or `fq.gz` for `Illumina`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`. | +| `datafile` | Full path to read data file. Must be `bam`, `cram`, `fastq.gz`, or `fq.gz` for `hic` and `Illumina`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`. | | `library` | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
diff --git a/subworkflows/local/align_illumina_fastq.nf b/subworkflows/local/align_illumina_fastq.nf deleted file mode 100644 index 0447ea4..0000000 --- a/subworkflows/local/align_illumina_fastq.nf +++ /dev/null @@ -1,63 +0,0 @@ -// -// Align Illumina FASTQ data against the genome -// - -include { BWAMEM2_MEM } from '../../modules/nf-core/bwamem2/mem/main' -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup' - - -workflow ALIGN_ILLUMINA_FASTQ { - take: - fasta // channel: [ val(meta), /path/to/fasta ] - index // channel: [ val(meta), /path/to/bwamem2/ ] - reads // channel: [ val(meta), /path/to/datafile ] - - - main: - ch_versions = Channel.empty() - - - // Align Fastq to Genome and output sorted BAM - BWAMEM2_MEM ( reads, index, true ) - ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() ) - - - // Collect all BWAMEM2 output by sample name - BWAMEM2_MEM.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple( by: [0] ) - | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } - | branch { - meta, bams -> - single_bam: bams.size() == 1 - multi_bams: true - } - | set { ch_bams } - - - // Merge, but only if there is more than 1 file - SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - SAMTOOLS_MERGE.out.bam - | mix ( ch_bams.single_bam ) - | set { ch_bam } - - - // Mark duplicates - SAMTOOLS_SORMADUP ( ch_bam, fasta ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) - - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORMADUP.out.bam - | map { meta, bam -> [ meta, bam, [] ] } - | set { ch_stat } - - - emit: - bam = ch_stat // channel: [ val(meta), /path/to/bam ] - versions = ch_versions // channel: [ 
versions.yml ] -} diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index 33c27b6..3237a1e 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -18,15 +18,31 @@ workflow ALIGN_SHORT { main: ch_versions = Channel.empty() + // Check file types and branch + reads + | branch { + meta, reads -> + fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ } + cram : true + } + | set { ch_reads } + - // Convert from CRAM to FASTQ - SAMTOOLS_FASTQ ( reads, false ) - ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) + // Convert from CRAM to FASTQ + SAMTOOLS_FASTQ ( ch_reads.cram, false ) + ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) - // Align Fastq to Genome and output sorted BAM - BWAMEM2_MEM ( SAMTOOLS_FASTQ.out.fastq, index, true ) - ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() ) + // Merge converted CRAM reads with reads that were already FASTQ. + // A channel object is always truthy, so if/else cannot route by file type. + SAMTOOLS_FASTQ.out.fastq + | mix ( ch_reads.fastq ) + | set { ch_reads_fastq } + + + // Align Fastq to Genome and output sorted BAM + BWAMEM2_MEM ( ch_reads_fastq, index, true ) + ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() ) // Collect all BWAMEM2 output by sample name diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf index 3623f22..2910f52 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -33,7 +33,6 @@ include { ALIGN_PACBIO as ALIGN_HIFI } from '../subworkflows/local/align_pacb include { ALIGN_PACBIO as ALIGN_CLR } from '../subworkflows/local/align_pacbio' include { ALIGN_ONT } from '../subworkflows/local/align_ont' include { CONVERT_STATS } from '../subworkflows/local/convert_stats' -include { ALIGN_ILLUMINA_FASTQ } from '../subworkflows/local/align_illumina_fastq' /* @@ -77,13 +76,6 @@ workflow READMAPPING { ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) - ch_reads.illumina - | branch { - meta, 
reads -> - fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ } - cram : reads.findAll { it.getName().toLowerCase() =~ /.*cram/ } - } - | set { ch_illumina } // // SUBWORKFLOW: Uncompress and prepare reference genome files @@ -121,12 +113,9 @@ workflow READMAPPING { ALIGN_HIC ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.hic ) ch_versions = ch_versions.mix ( ALIGN_HIC.out.versions ) - ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_illumina.cram ) + ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina ) ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions ) - ALIGN_ILLUMINA_FASTQ ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_illumina.fastq ) - ch_versions = ch_versions.mix ( ALIGN_ILLUMINA_FASTQ.out.versions ) - ALIGN_HIFI ( PREPARE_GENOME.out.fasta, ch_reads.pacbio, ch_vector_db ) ch_versions = ch_versions.mix ( ALIGN_HIFI.out.versions )