diff --git a/conf/base.config b/conf/base.config
index ca753f3..386bbb4 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -100,4 +100,16 @@ process {
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }
+
+    withName: SEQKIT_SPLIT2 {
+        cpus = { log_increase_cpus(2, 1*task.attempt, meta.read_count/1000000, 2) } // derived from the retry attempt and the read count in millions (helper defined elsewhere)
+        memory = { check_max( 2.GB + 500.MB * log_increase_cpus(2, 1*task.attempt, meta.read_count/1000000, 2), 'memory' ) } // 2 GB plus 500 MB times the value computed for cpus
+        time = { check_max( 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } // 1 h per started million reads, times the attempt number
+    }
+
+    withName: SAMTOOLS_SPLIT {
+        cpus = { log_increase_cpus(2, 2*task.attempt, 1, 2) } // independent of the read count; only the retry attempt varies
+        memory = { check_max( 1.GB * task.attempt, 'memory' ) } // 1 GB per attempt
+        time = { check_max( 2.hour * task.attempt, 'time' ) } // 2 h per attempt
+    }
 }
diff --git a/conf/modules.config b/conf/modules.config
index 089592e..d14e0ef 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -28,6 +28,11 @@ process {
         ext.prefix = { "${meta.id}.merge" }
     }
 
+    withName: SAMTOOLS_MERGE_CHUNKS {
+        ext.args = { "-c -p" } // samtools merge: combine @RG (-c) and @PG (-p) headers whose IDs collide
+        ext.prefix = { "${meta.id}.merge" }
+    }
+
     // If custom header provided, this is inserted in place of existing
     // @HD and @SQ lines, while preserving any other header entries
     withName: SAMTOOLS_REHEADER {
@@ -100,5 +105,13 @@ process {
             pattern: '*_versions.yml'
         ]
     }
+
+    withName: SEQKIT_SPLIT2 {
+        ext.args = "--by-part 10" // seqkit split2: split every input into 10 parts
+    }
+    withName: SAMTOOLS_SPLIT {
+        ext.args = "-h" // samtools view: print the header along with the alignments
+        ext.chunk_size = 10000 // number of SAM text lines per chunk (handed to split -l by the module)
+    }
 
 }
diff --git a/modules.json b/modules.json
index 113030b..74e9748 100644
--- a/modules.json
+++ b/modules.json
@@ -84,7 +84,17 @@
                     "samtools/view": {
                         "branch": "master",
                         "git_sha": "6c2309aaec566c0d44a6cf14d4b2d0c51afe2e91",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
+                    },
+                    "seqkit/split2": {
+                        "branch": "master",
+                        "git_sha": "2be41ca2cc780eca4293d1b0dd3850b0b7ac40a3",
+                        "installed_by": [
+                            "modules"
+                        ],
+                        "patch": "modules/nf-core/seqkit/split2/seqkit-split2.diff"
                     },
                     "untar": {
                         "branch": "master",
diff --git a/modules/local/samtools_split.nf 
b/modules/local/samtools_split.nf
new file mode 100644
index 0000000..b5ac080
--- /dev/null
+++ b/modules/local/samtools_split.nf
@@ -0,0 +1,75 @@
+// Split a BAM or CRAM file into chunks that can be processed in parallel.
+//
+// Inputs:
+//   meta, reads  - sample metadata map and the BAM/CRAM file to split
+//   meta2, fasta - reference metadata map and FASTA, only used to decode CRAM input
+// Outputs:
+//   chunked_bam / chunked_cram - chunks named <prefix>_part_<suffix>.<extension of reads>
+//   versions                   - versions.yml with the samtools version
+// task.ext settings:
+//   args       - extra options for the samtools view call that streams the records as SAM text
+//   prefix     - basename of the chunks (default: meta.id)
+//   chunk_size - records per chunk (default: 10000); use an even number if mates sit on adjacent lines
+process SAMTOOLS_SPLIT {
+    tag "$meta.id"
+    label "process_high"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' :
+        'biocontainers/samtools:1.20--h50ea8bc_0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    tuple val(meta2), path(fasta)
+
+    output:
+    tuple val(meta), path("*.bam"), optional: true, emit: chunked_bam
+    tuple val(meta), path("*.cram"), optional: true, emit: chunked_cram
+
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def chunk_size = task.ext.chunk_size ?: 10000
+    def file_extension = reads.name.split('\\.')[-1]
+    // The reference is only required to decode CRAM input
+    def reference = (fasta && file_extension == 'cram') ? "--reference ${fasta}" : ""
+
+    """
+    set -o pipefail
+
+    if [ "${file_extension}" != "bam" ] && [ "${file_extension}" != "cram" ]; then
+        echo "Unsupported file type: ${file_extension}"
+        exit 1
+    fi
+
+    # Keep the header aside so that it can be prepended to every chunk
+    samtools view -H ${reference} ${reads} > ${prefix}.header.sam
+
+    # Stream the records into chunks of chunk_size lines. Header lines (present when args
+    # contains -h) are dropped first: otherwise they would end up in the first chunk only
+    # and shift every chunk boundary. A SAM read name cannot start with '@'.
+    samtools view -@ ${task.cpus} ${args} ${reference} ${reads} \\
+        | sed '/^@/d' \\
+        | split -l ${chunk_size} - ${prefix}_part_
+
+    # Re-attach the header to each chunk and compress it
+    # NOTE(review): as before, chunks are BAM-encoded (-b) even when they carry the .cram extension
+    for chunk in ${prefix}_part_*; do
+        [ -e "\$chunk" ] || continue
+        cat ${prefix}.header.sam "\$chunk" | samtools view -@ ${task.cpus} -b -o "\${chunk}.${file_extension}" -
+        rm "\$chunk"
+    done
+    rm ${prefix}.header.sam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/seqkit/split2/environment.yml b/modules/nf-core/seqkit/split2/environment.yml
new file mode 100644
index 0000000..85716aa
--- /dev/null
+++ b/modules/nf-core/seqkit/split2/environment.yml
@@ -0,0 +1,7 @@
+name: seqkit_split2
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::seqkit=2.8.1
diff --git a/modules/nf-core/seqkit/split2/main.nf b/modules/nf-core/seqkit/split2/main.nf
new file mode 100644
index 0000000..b4543b8
--- /dev/null
+++ b/modules/nf-core/seqkit/split2/main.nf
@@ -0,0 +1,51 @@
+process SEQKIT_SPLIT2 {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/seqkit:2.8.1--h9ee0642_0' :
+        'biocontainers/seqkit:2.8.1--h9ee0642_0' }"
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path("**/*.gz"), emit: reads
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if(reads instanceof List && reads.size() == 2){
+        """
+        seqkit \\
+            split2 \\
+            $args \\
+            --threads $task.cpus \\
+            --read1 ${reads[0]} \\
+            --read2 ${reads[1]}
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+        END_VERSIONS
+        """
+    } else {
+        """
+        seqkit \\
+            split2 \\
+            $args \\
+            --threads $task.cpus \\
+            $reads
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+        END_VERSIONS
+        """
+    }
+}
diff --git a/modules/nf-core/seqkit/split2/meta.yml b/modules/nf-core/seqkit/split2/meta.yml
new file mode 100644
index 0000000..14babd4
--- /dev/null
+++ b/modules/nf-core/seqkit/split2/meta.yml
@@ -0,0 +1,42 @@
+name: seqkit_split2 +description: Split single or paired-end fastq.gz files +keywords: + - split + - fastq + - seqkit +tools: + - seqkit: + description: | + Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, written by Wei Shen. + homepage: https://github.com/shenwei356/seqkit + documentation: https://bioinf.shenwei.me/seqkit/ + doi: 10.1371/journal.pone.0163962 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: FastQ files + pattern: "*.{fq.gz/fastq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Split fastq files + pattern: "*.{fq.gz/fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/seqkit/split2/seqkit-split2.diff b/modules/nf-core/seqkit/split2/seqkit-split2.diff new file mode 100644 index 0000000..f6d5ce7 --- /dev/null +++ b/modules/nf-core/seqkit/split2/seqkit-split2.diff @@ -0,0 +1,34 @@ +Changes in module 'nf-core/seqkit/split2' +--- modules/nf-core/seqkit/split2/main.nf ++++ modules/nf-core/seqkit/split2/main.nf +@@ -20,14 +20,14 @@ + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" +- if(meta.single_end){ ++ if(reads instanceof List && reads.size() == 2){ + """ + seqkit \\ + split2 \\ + $args \\ + --threads $task.cpus \\ +- $reads \\ +- --out-dir ${prefix} ++ --read1 ${reads[0]} \\ ++ --read2 ${reads[1]} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": +@@ -40,9 +40,7 @@ + split2 \\ + $args \\ + --threads $task.cpus \\ +- --read1 ${reads[0]} \\ +- --read2 ${reads[1]} \\ +- --out-dir ${prefix} ++ $reads + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + 
+************************************************************ diff --git a/modules/nf-core/seqkit/split2/tests/length.config b/modules/nf-core/seqkit/split2/tests/length.config new file mode 100644 index 0000000..64d8a9a --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/length.config @@ -0,0 +1,5 @@ +process { + withName: SEQKIT_SPLIT2 { + ext.args = '--by-length 8K' + } +} diff --git a/modules/nf-core/seqkit/split2/tests/main.nf.test b/modules/nf-core/seqkit/split2/tests/main.nf.test new file mode 100644 index 0000000..abf8d06 --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/main.nf.test @@ -0,0 +1,222 @@ +nextflow_process { + + name "Test Process SEQKIT_SPLIT2" + script "../main.nf" + process "SEQKIT_SPLIT2" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/split2" + + test("single_end - length") { + + config "./length.config" + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end - length") { + + config "./length.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - length - stub") { + + options "-stub" + config "./length.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() } + ) + } + + } + + test("single_end - part") { + + config "./part.config" + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end - part") { + + config "./part.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - part - stub") { + + options "-stub" + config "./part.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - size") { + + config "./size.config" + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end - size") { + + config "./size.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + 
assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - size - stub") { + + options "-stub" + config "./size.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/split2/tests/main.nf.test.snap b/modules/nf-core/seqkit/split2/tests/main.nf.test.snap new file mode 100644 index 0000000..071e358 --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/main.nf.test.snap @@ -0,0 +1,414 @@ +{ + "paired_end - size": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + "test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411", + "test_2.part_001.fastq.gz:md5,201ee95b559240e27830970b78a547c8", + "test_2.part_002.fastq.gz:md5,35ff29a76f34b2507a37287352324650" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + "test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411", + "test_2.part_001.fastq.gz:md5,201ee95b559240e27830970b78a547c8", + "test_2.part_002.fastq.gz:md5,35ff29a76f34b2507a37287352324650" + ] + ] + ], + "versions": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T05:49:27.111190403" + }, + "single_end - size": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + 
"test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + "test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411" + ] + ] + ], + "versions": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T05:43:42.879258276" + }, + "single_end - part": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52" + ] + ] + ], + "versions": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T05:43:16.85414188" + }, + "single_end - length": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d" + ] + ] + ], + "versions": [ + 
"versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T05:42:47.553659985" + }, + "single_end - size - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + "test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + "test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411" + ] + ] + ], + "versions": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T05:44:01.29740288" + }, + "single_end - part - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52" + ] + ] + ], + "versions": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T05:43:33.964460368" + }, + "paired_end - length - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "reads": [ + + ], + "versions": [ + + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + 
"timestamp": "2024-05-08T05:42:57.687195505" + }, + "single_end - length - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d" + ] + ] + ], + "versions": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T05:43:07.15316163" + }, + "paired_end - part": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52", + "test_2.part_001.fastq.gz:md5,13d14d69744bd93c8c51873b529bf714", + "test_2.part_002.fastq.gz:md5,5f5a78d78f312164a1159c62d65c15f1", + "test_2.part_003.fastq.gz:md5,48bcbceb485b73bcf1f198e252b016d8" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52", + "test_2.part_001.fastq.gz:md5,13d14d69744bd93c8c51873b529bf714", + "test_2.part_002.fastq.gz:md5,5f5a78d78f312164a1159c62d65c15f1", + "test_2.part_003.fastq.gz:md5,48bcbceb485b73bcf1f198e252b016d8" + ] + ] + ], + "versions": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": 
"23.10.1" + }, + "timestamp": "2024-05-08T05:48:43.715062381" + }, + "paired_end - length": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d", + "test_2.part_001.fastq.gz:md5,77b9076f82a762711582584342bde5a1", + "test_2.part_002.fastq.gz:md5,33bb6e3edc759baa7ba6580da36def48" + ] + ] + ], + "1": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d", + "test_2.part_001.fastq.gz:md5,77b9076f82a762711582584342bde5a1", + "test_2.part_002.fastq.gz:md5,33bb6e3edc759baa7ba6580da36def48" + ] + ] + ], + "versions": [ + "versions.yml:md5,7725bee2f34895f4c1678afacf87ed4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T05:46:21.786837526" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/split2/tests/part.config b/modules/nf-core/seqkit/split2/tests/part.config new file mode 100644 index 0000000..2436bfb --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/part.config @@ -0,0 +1,5 @@ +process { + withName: SEQKIT_SPLIT2 { + ext.args = '--by-part 3' + } +} diff --git a/modules/nf-core/seqkit/split2/tests/size.config b/modules/nf-core/seqkit/split2/tests/size.config new file mode 100644 index 0000000..42a153d --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/size.config @@ -0,0 +1,5 @@ +process { + withName: SEQKIT_SPLIT2 { + ext.args = '--by-size 50' + } +} diff --git a/modules/nf-core/seqkit/split2/tests/tags.yml b/modules/nf-core/seqkit/split2/tests/tags.yml new file mode 100644 index 0000000..cd4f5c4 --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/tags.yml @@ -0,0 +1,2 @@ +seqkit/split2: + - 
"modules/nf-core/seqkit/split2/**"
diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf
index ef1a021..b1681df 100644
--- a/subworkflows/local/align_ont.nf
+++ b/subworkflows/local/align_ont.nf
@@ -4,6 +4,8 @@
 
 include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main'
 include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_CHUNKS } from '../../modules/nf-core/samtools/merge/main'
+include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main'
 
 
 workflow ALIGN_ONT {
@@ -16,13 +18,43 @@ workflow ALIGN_ONT {
     ch_versions = Channel.empty()
 
 
+    // Split FASTQ files into chunks
+    SEQKIT_SPLIT2 ( reads )
+    ch_versions = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )
+
+
+    // Emit every split file as its own item, with the chunk label (part_NNN) appended to the id
+    SEQKIT_SPLIT2.out.reads
+    | flatMap { meta, reads ->
+        reads.collect {
+            def chunk_number = it.getName().toString().split('\\.')[-3]
+            def new_meta = meta.clone()
+            new_meta.id = "${meta.id}_${chunk_number}"
+            [new_meta, it]
+        }
+    }
+    | set { ch_reads_rg }
+
+
     // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM
-    MINIMAP2_ALIGN ( reads, fasta, true, "bai", false, false )
+    MINIMAP2_ALIGN ( ch_reads_rg, fasta, true, "bai", false, false )
     ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() )
 
 
-    // Collect all alignment output by sample name
+    // Drop the chunk label (the last two '_' fields of the id) so that chunks of one input are grouped
     MINIMAP2_ALIGN.out.bam
+    | map { meta, bam -> [['id': meta.id.split('_')[0..-3].join('_'), 'datatype': meta.datatype, 'read_group':meta.read_group, 'read_count': meta.read_count ], bam] }
+    | groupTuple( by: [0] )
+    | set { ch_bams_merge }
+
+
+    // Merge chunks
+    SAMTOOLS_MERGE_CHUNKS ( ch_bams_merge, [ [], [] ], [ [], [] ] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE_CHUNKS.out.versions.first() )
+
+
+    // Collect all alignment output by sample name
+    SAMTOOLS_MERGE_CHUNKS.out.bam
     | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
     | groupTuple ( by: [0] )
     | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf
index f472a6c..e18a292 100644
--- a/subworkflows/local/align_pacbio.nf
+++ b/subworkflows/local/align_pacbio.nf
@@ -5,6 +5,8 @@
 include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio'
 include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main'
 include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
+include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main'
+include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_CHUNKS } from '../../modules/nf-core/samtools/merge/main'
 
 
 workflow ALIGN_PACBIO {
@@ -23,12 +25,43 @@ workflow ALIGN_PACBIO {
     ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions )
 
 
+    // Split FASTQ files into chunks
+    SEQKIT_SPLIT2 ( FILTER_PACBIO.out.fastq )
+    ch_versions = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )
+
+
+    // Emit every split file as its own item, with the chunk label (part_NNN) appended to the id
+    SEQKIT_SPLIT2.out.reads
+    | flatMap { meta, reads ->
+        reads.collect {
+            def chunk_number = it.getName().toString().split('\\.')[-3]
+            def new_meta = meta.clone()
+            new_meta.id = "${meta.id}_${chunk_number}"
+            [new_meta, it]
+        }
+    }
+    | set { ch_reads_rg }
+
+
     // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM
-    MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, "bai", false, false )
+    MINIMAP2_ALIGN ( ch_reads_rg, fasta, true, "bai", false, false )
     ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() )
 
-    // Collect all alignment output by sample name
+
+    // Drop the chunk label (the last two '_' fields of the id) so that chunks of one input are grouped
     MINIMAP2_ALIGN.out.bam
+    | map { meta, bam -> [['id': meta.id.split('_')[0..-3].join('_'), 'datatype': meta.datatype, 'read_group':meta.read_group, 'read_count': meta.read_count ], bam] }
+    | groupTuple( by: [0] )
+    | set { ch_bams_merge }
+
+
+    // Merge chunks
+    SAMTOOLS_MERGE_CHUNKS ( ch_bams_merge, [ [], [] ], [ [], [] ] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE_CHUNKS.out.versions.first() )
+
+
+    // Collect all alignment output by sample name
+    SAMTOOLS_MERGE_CHUNKS.out.bam
     | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
     | groupTuple ( by: [0] )
     | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf
index e74b480..df97d12 100644
--- a/subworkflows/local/align_short.nf
+++ b/subworkflows/local/align_short.nf
@@ -1,11 +1,15 @@
 //
 // Align short read (HiC and Illumina) data against the genome
 //
+params.num_chunks = 4
 
 include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main'
 include { BWAMEM2_MEM } from '../../modules/nf-core/bwamem2/mem/main'
 include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_CHUNKS } from '../../modules/nf-core/samtools/merge/main'
 include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup'
+include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main'
+include { SAMTOOLS_SPLIT } from '../../modules/local/samtools_split'
 
 
 workflow ALIGN_SHORT {
@@ -28,13 +32,50 @@ workflow ALIGN_SHORT {
     | set { ch_reads }
 
 
+    // Split CRAM files into chunks
+    SAMTOOLS_SPLIT ( ch_reads.cram, fasta )
+    ch_versions = ch_versions.mix ( SAMTOOLS_SPLIT.out.versions.first() )
+
+
+    // Emit every chunk as its own item; the chunk label is the last two '_' fields of the file basename (part_<suffix>)
+    SAMTOOLS_SPLIT.out.chunked_cram
+    | flatMap { meta, chunked_cram ->
+        chunked_cram.collect {
+            def chunk_number = it.getName().toString().split('\\.')[0].split('_')[-2..-1].join('_')
+            def new_meta = meta.clone()
+            new_meta.id = "${meta.id}_${chunk_number}"
+            [new_meta, it]
+        }
+    }
+    | set { ch_chunks_from_cram }
+
+
     // Convert from CRAM to FASTQ only if CRAM files were provided as input
-    SAMTOOLS_FASTQ ( ch_reads.cram, false )
+    SAMTOOLS_FASTQ ( ch_chunks_from_cram, false )
     ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() )
-
-
-    SAMTOOLS_FASTQ.out.fastq
-    | mix ( ch_reads.fastq )
+
+
+    // Split FASTQ files into chunks
+    SEQKIT_SPLIT2 ( ch_reads.fastq )
+    ch_versions = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )
+
+
+    // Emit every split file as its own item, labelled part_NNN. NOTE(review): paired R1/R2 parts would be separated here - confirm FASTQ input is single-file
+    SEQKIT_SPLIT2.out.reads
+    | flatMap { meta, reads ->
+        reads.collect {
+            def chunk_number = it.getName().toString().split('\\.')[-3]
+            def new_meta = meta.clone()
+            new_meta.id = "${meta.id}_${chunk_number}"
+            [new_meta, it]
+        }
+    }
+    | set { ch_chunks_from_fastq }
+
+
+    // Mix FASTQ files
+    ch_chunks_from_fastq
+    | mix ( SAMTOOLS_FASTQ.out.fastq )
     | set { ch_reads_fastq }
 
 
@@ -43,8 +84,19 @@ workflow ALIGN_SHORT {
     ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() )
 
 
-    // Collect all BWAMEM2 output by sample name
+    // Drop the chunk label (the last two '_' fields of the id) so that chunks of one input are grouped
     BWAMEM2_MEM.out.bam
+    | map { meta, bam -> [['id': meta.id.split('_')[0..-3].join('_'), 'datatype': meta.datatype, 'read_group':meta.read_group, 'read_count': meta.read_count ], bam] }
+    | groupTuple( by: [0] )
+    | set { ch_bams_merge }
+
+
+    // Merge chunks
+    SAMTOOLS_MERGE_CHUNKS ( ch_bams_merge, [ [], [] ], [ [], [] ] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE_CHUNKS.out.versions.first() )
+
+
+    // Collect all SAMTOOLS_MERGE_CHUNKS output by sample name
+    SAMTOOLS_MERGE_CHUNKS.out.bam
     | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
     | groupTuple( by: [0] )
     | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }