Skip to content

Commit

Permalink
add input type fastq.gz and fg.gz for Illumina reads
Browse files Browse the repository at this point in the history
  • Loading branch information
reichan1998 committed Jul 9, 2024
1 parent f9be6aa commit 8b74d6c
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 3 deletions.
2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2
| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). |
| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, or `ont`. |
| `datafile` | Full path to read data file. Must be `bam` or `cram` for `hic` and `illumina`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`. |
| `datafile` | Full path to read data file. Must be `bam` or `cram` for `hic`. Must be `bam`, `cram`, `fastq.gz` or `fq.gz` for `illumina`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`. |
| `library` | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
Expand Down
63 changes: 63 additions & 0 deletions subworkflows/local/align_illumina_fastq.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
//
// Align Illumina FASTQ data against the genome
//

include { BWAMEM2_MEM } from '../../modules/nf-core/bwamem2/mem/main'
include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup'


//
// Subworkflow: runs BWAMEM2_MEM on each Illumina FASTQ input, regroups the
// resulting BAMs under a shortened sample id, merges groups that hold more
// than one BAM with SAMTOOLS_MERGE, and passes the result to SAMTOOLS_SORMADUP.
// Emits the SORMADUP BAM channel and the collected tool versions.
//
workflow ALIGN_ILLUMINA_FASTQ {
take:
fasta // channel: [ val(meta), /path/to/fasta ]
index // channel: [ val(meta), /path/to/bwamem2/ ]
reads // channel: [ val(meta), /path/to/datafile ]


main:
ch_versions = Channel.empty()


// Align Fastq to Genome and output sorted BAM
// NOTE(review): the third argument (true) is presumably the module's sort flag;
// confirm against the installed modules/nf-core/bwamem2/mem signature.
BWAMEM2_MEM ( reads, index, true )
ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() )


// Collect all BWAMEM2 output by sample name.
// The grouping key is a new meta map holding only `id` and `datatype`; the id
// is the original meta.id with its last '_'-separated token removed
// (presumably a per-run suffix; verify against INPUT_CHECK). Any other meta
// fields are not carried forward. read_count travels as a separate tuple
// element so the per-group values can be summed back into meta afterwards.
// NOTE(review): an id containing no '_' leaves a single token for the
// [0..-2] range; confirm that ids always carry a suffix.
// Groups with exactly one BAM go to `single_bam`; all others to `multi_bams`.
BWAMEM2_MEM.out.bam
| map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
| groupTuple( by: [0] )
| map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
| branch {
meta, bams ->
single_bam: bams.size() == 1
multi_bams: true
}
| set { ch_bams }


// Merge, but only if there is more than 1 file
// The two `[ [], [] ]` arguments are empty placeholder tuples for the module's
// remaining inputs (presumably optional reference fasta / index; TODO confirm).
SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )


// Recombine merged BAMs with the groups that needed no merge
SAMTOOLS_MERGE.out.bam
| mix ( ch_bams.single_bam )
| set { ch_bam }


// Mark duplicates
// Unlike the two calls above, versions are mixed in here without `.first()`.
SAMTOOLS_SORMADUP ( ch_bam, fasta )
ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )


// Append an empty list as a third tuple element to each [ meta, bam ] item.
// No CRAM conversion, indexing or statistics happen in this subworkflow;
// presumably the consumer (e.g. CONVERT_STATS) expects this shape (TODO confirm).
SAMTOOLS_SORMADUP.out.bam
| map { meta, bam -> [ meta, bam, [] ] }
| set { ch_stat }


emit:
bam = ch_stat // channel: [ val(meta), /path/to/bam, [] ]
versions = ch_versions // channel: [ versions.yml ]
}
15 changes: 13 additions & 2 deletions workflows/readmapping.nf
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ include { ALIGN_PACBIO as ALIGN_HIFI } from '../subworkflows/local/align_pacb
include { ALIGN_PACBIO as ALIGN_CLR } from '../subworkflows/local/align_pacbio'
include { ALIGN_ONT } from '../subworkflows/local/align_ont'
include { CONVERT_STATS } from '../subworkflows/local/convert_stats'
include { ALIGN_ILLUMINA_FASTQ } from '../subworkflows/local/align_illumina_fastq'


/*
Expand Down Expand Up @@ -76,6 +77,13 @@ workflow READMAPPING {

ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )

//
// Route Illumina inputs by file type. Names ending in `.fastq.gz` or `.fq.gz`
// (case-insensitive, full match) go to the `fastq` branch. Everything else
// falls through to `cram`, so that BAM inputs, which the usage docs allow for
// `illumina`, are kept on the existing ALIGN_ILLUMINA path instead of matching
// no branch and being silently dropped.
// `findAll` returns a list, which Groovy treats as true only when non-empty.
//
ch_reads.illumina
| branch {
    meta, reads ->
        fastq : reads.findAll { it.getName().toLowerCase() ==~ /.*\.(fastq|fq)\.gz/ }
        cram : true
}
| set { ch_illumina }

//
// SUBWORKFLOW: Uncompress and prepare reference genome files
Expand Down Expand Up @@ -112,10 +120,13 @@ workflow READMAPPING {
//
ALIGN_HIC ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.hic )
ch_versions = ch_versions.mix ( ALIGN_HIC.out.versions )

ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina )
ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_illumina.cram )
ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions )

ALIGN_ILLUMINA_FASTQ ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_illumina.fastq )
ch_versions = ch_versions.mix ( ALIGN_ILLUMINA_FASTQ.out.versions )

ALIGN_HIFI ( PREPARE_GENOME.out.fasta, ch_reads.pacbio, ch_vector_db )
ch_versions = ch_versions.mix ( ALIGN_HIFI.out.versions )

Expand Down

0 comments on commit 8b74d6c

Please sign in to comment.