From 6a04301da94e6631e7570f2cb25ce5121741ade5 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:07:25 +0100 Subject: [PATCH 01/17] Updates for new subworkflow --- bin/gc_content.py | 29 ++++++++++++++++++ modules/local/gc_content.nf | 37 +++++++++++++++++++++++ subworkflows/local/generate_genome.nf | 43 +++++++++++++++++++-------- subworkflows/local/yaml_input.nf | 22 ++++++++++++-- workflows/ascc.nf | 33 ++++++++++++-------- 5 files changed, 138 insertions(+), 26 deletions(-) create mode 100644 bin/gc_content.py create mode 100644 modules/local/gc_content.nf diff --git a/bin/gc_content.py b/bin/gc_content.py new file mode 100644 index 0000000..b2f786b --- /dev/null +++ b/bin/gc_content.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +""" +Script for finding the GC content of each sequence in a multiFASTA file +""" + +import argparse +import general_purpose_functions as gpf + + +def main(fasta_path): + fasta_data = gpf.read_fasta_in_chunks(fasta_path) + for header, seq in fasta_data: + header = header.split()[0] + seq = seq.upper() + gc_content = None + gc_count = seq.count("G") + seq.count("C") + seq_len = len(seq) + if seq_len > 0: + gc_content = gc_count / seq_len + gc_content_string = "{:.6f}".format(gc_content) + print("{}\t{}".format(header, gc_content_string)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("fasta_path", type=str, help="Path to input FASTA file") + parser.add_argument("-v", action="version", version="1.0") + args = parser.parse_args() + main(args.fasta_path) diff --git a/modules/local/gc_content.nf b/modules/local/gc_content.nf new file mode 100644 index 0000000..0a650bf --- /dev/null +++ b/modules/local/gc_content.nf @@ -0,0 +1,37 @@ +process GC_CONTENT { + tag "${meta.id}" + label 'process_low' + + conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path( "*-gc.txt" ) , emit: txt + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gc_content.py ${fasta} > ${prefix}-gc.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gc_content: \$(gc_content.py -v) + END_VERSIONS + """ + + stub: + """ + touch full_coords.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gc_content: \$(gc_content.py -v) + END_VERSIONS + """ +} diff --git a/subworkflows/local/generate_genome.nf b/subworkflows/local/generate_genome.nf index 0f41c72..61cd434 100755 --- a/subworkflows/local/generate_genome.nf +++ b/subworkflows/local/generate_genome.nf @@ -3,29 +3,48 @@ // // MODULE IMPORT BLOCK // +include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' +include { CUSTOM_GETCHROMSIZES } from '../../modules/nf-core/custom/getchromsizes/main' +include { GNU_SORT } from '../../modules/nf-core/gnu/sort' +include { GET_LARGEST_SCAFF } from '../../modules/local/get_largest_scaff' workflow GENERATE_GENOME { take: - assembly_title // Channel val(assembly_title) - reference // Channel [ val(meta), path(file) ] + to_chromsize // tuple [[meta.id], file] main: ch_versions = Channel.empty() // - // LOGIC: GENERATES A REFERENCE DATA TUPLE + // MODULE: GENERATE INDEX OF REFERENCE + // EMITS REFERENCE INDEX FILE MODIFIED FOR SCAFF SIZES // - reference - .combine( assembly_title ) - .map { it -> - tuple ([id: it[1]], - it[0]) - } - .set { reference_ch } + CUSTOM_GETCHROMSIZES ( + to_chromsize, + "genome" + ) + ch_versions = ch_versions.mix( CUSTOM_GETCHROMSIZES.out.versions ) - // THIS IS HERE FOR FUTURE EXPANSION + // + // MODULE: SORT CHROM SIZES BY CHOM SIZE NOT NAME + // + GNU_SORT ( + CUSTOM_GETCHROMSIZES.out.sizes + ) + + // + // MODULE: Cut out the largest scaffold size and use as comparator against 512MB + // This is the cut off for TABIX using tbi indexes + // + GET_LARGEST_SCAFF ( + CUSTOM_GETCHROMSIZES.out.sizes + ) + ch_versions = ch_versions.mix( GET_LARGEST_SCAFF.out.versions ) emit: - reference_tuple = reference_ch + max_scaff_size = GET_LARGEST_SCAFF.out.scaff_size.toInteger() + dot_genome = GNU_SORT.out.sorted + ref_index = CUSTOM_GETCHROMSIZES.out.fai + reference_tuple = to_chromsize versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index b4891b0..4ffd8c7 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -48,9 +48,27 @@ workflow YAML_INPUT { } .set { seqkit } + group.assembly_title + .combine( group.assembly_path ) + .map { id, file -> + tuple( [ id: meta.id ], + file + ) + } + .set { ch_reference } + + group.assembly_title + .combine( group.pacbio_reads ) + .map { id, file -> + tuple( [ id: meta.id ], + file + ) + } + .set { ch_pacbio } + emit: - pacbio_reads = group.pacbio_reads - reference = group.assembly_path + reference_tuple = ch_reference + pacbio_tuple = ch_pacbio assembly_title = group.assembly_title taxid = group.taxid nt_database = group.nt_database diff --git a/workflows/ascc.nf b/workflows/ascc.nf index f7b0773..6060d12 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -20,16 +20,21 @@ WorkflowAscc.initialise(params, log) IMPORT LOCAL MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { YAML_INPUT } from '../subworkflows/local/yaml_input' -include { GENERATE_GENOME } from '../subworkflows/local/generate_genome' -include { EXTRACT_TIARA_HITS } from '../subworkflows/local/extract_tiara_hits' -include { EXTRACT_NT_BLAST } from '../subworkflows/local/extract_nt_blast' -include { RUN_FCSADAPTOR } from '../subworkflows/local/run_fcsadaptor' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { RUN_NT_KRAKEN } from '..//subworkflows/local/run_nt_kraken' +include { YAML_INPUT } from '../subworkflows/local/yaml_input' +include { GENERATE_GENOME } from '../subworkflows/local/generate_genome' +include { EXTRACT_TIARA_HITS } from '../subworkflows/local/extract_tiara_hits' +include { EXTRACT_NT_BLAST } from '../subworkflows/local/extract_nt_blast' +include { RUN_FCSADAPTOR } from '../subworkflows/local/run_fcsadaptor' +include { RUN_NT_KRAKEN } from '..//subworkflows/local/run_nt_kraken' + +// +// MODULE: Local modules +// +include { GC_CONTENT } from '../modules/local/gc_content' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -40,7 +45,7 @@ include { RUN_NT_KRAKEN } from '..//subworkflows/local/run_nt_kraken' // // MODULE: Installed directly from nf-core/modules // -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -63,15 +68,19 @@ workflow ASCC { ) ch_versions = ch_versions.mix(YAML_INPUT.out.versions) - //Channel.fromPath( YAML_INPUT.out.nt_database, checkIfExists=true ) -// .set { blast_db } + GC_CONTENT ( + YAML_INPUT.out.ref_tuple + ) + + //Channel + // .fromPath( YAML_INPUT.out.nt_database, checkIfExists=true ) + // .set { blast_db } // // SUBWORKFLOW: GENERATE GENOME FILE // GENERATE_GENOME ( - YAML_INPUT.out.assembly_title, - YAML_INPUT.out.reference + YAML_INPUT.out.ref_tuple ) ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) @@ -115,7 +124,7 @@ workflow ASCC { RUN_FCSADAPTOR ( GENERATE_GENOME.out.reference_tuple ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) // // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS From 70f8793dade546d08ac640287b7a5023e864e8fd Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:15:40 +0100 Subject: [PATCH 02/17] Adding modules for generate_genome --- modules.json | 83 +++++++++++++++---- modules/local/get_largest_scaff.nf | 40 +++++++++ modules/nf-core/custom/getchromsizes/main.nf | 44 ++++++++++ modules/nf-core/custom/getchromsizes/meta.yml | 53 ++++++++++++ modules/nf-core/gnu/sort/main.nf | 52 ++++++++++++ modules/nf-core/gnu/sort/meta.yml | 42 ++++++++++ modules/nf-core/samtools/faidx/main.nf | 50 +++++++++++ modules/nf-core/samtools/faidx/meta.yml | 57 +++++++++++++ 8 files changed, 405 insertions(+), 16 deletions(-) create mode 100644 modules/local/get_largest_scaff.nf create mode 100644 modules/nf-core/custom/getchromsizes/main.nf create mode 100644 modules/nf-core/custom/getchromsizes/meta.yml create mode 100644 modules/nf-core/gnu/sort/main.nf create mode 100644 modules/nf-core/gnu/sort/meta.yml create mode 100644 modules/nf-core/samtools/faidx/main.nf create mode 100644 modules/nf-core/samtools/faidx/meta.yml diff --git a/modules.json b/modules.json index 180afda..dab76c6 100644 --- a/modules.json +++ b/modules.json @@ -8,81 +8,132 @@ "blast/blastn": { "branch": "master", "git_sha": "1728df69ccbaf4ccb1027c6fd4e9191f48c22194", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "blast/makeblastdb": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "custom/getchromsizes": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": [ + "modules" + ] }, "diamond/blastx": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fcs/fcsadaptor": { "branch": "master", "git_sha": "5a35af8b60d45425c4b9193e567d16b614d93dbe", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "gnu/sort": { + "branch": "master", + "git_sha": "88f6e982fb8bd40488d837b3b08a65008e602840", + "installed_by": [ + "modules" + ] }, "kraken2/kraken2": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff" }, "minimap2/align": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/depth": { "branch": "master", "git_sha": "a1ffbc1fd87bd5a829e956cc26ec9cc53af3e817", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "samtools/faidx": { + "branch": "master", + "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", + "installed_by": [ + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/sort": { "branch": "master", "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "seqkit/sliding": { "branch": "master", "git_sha": "0aa251c4ce7318c86b9868d8cb8dd6dd5d7da849", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tiara/tiara": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/get_largest_scaff.nf b/modules/local/get_largest_scaff.nf new file mode 100644 index 0000000..2296958 --- /dev/null +++ b/modules/local/get_largest_scaff.nf @@ -0,0 +1,40 @@ +process GET_LARGEST_SCAFF { + + tag "$meta.id" + label 'process_low' + + conda "conda-forge::coreutils=9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'docker.io/ubuntu:20.04' }" + + input: + tuple val( meta ), path( file ) + + output: + env largest_scaff , emit: scaff_size + path "versions.yml" , emit: versions + + shell: + def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + $/ + largest_scaff=`head -n 1 "${file}" | cut -d$'\t' -f2` + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coreutils: $VERSION + END_VERSIONS + /$ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + largest_scaff=1000000 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coreutils: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/getchromsizes/main.nf b/modules/nf-core/custom/getchromsizes/main.nf new file mode 100644 index 0000000..060a2e8 --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/main.nf @@ -0,0 +1,44 @@ +process CUSTOM_GETCHROMSIZES { + tag "$fasta" + label 'process_single' + + conda "bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'biocontainers/samtools:1.16.1--h6899075_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path ("*.sizes"), emit: sizes + tuple val(meta), path ("*.fai") , emit: fai + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools faidx $fasta + cut -f 1,2 ${fasta}.fai > ${fasta}.sizes + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + getchromsizes: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${fasta}.fai + touch ${fasta}.sizes + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + getchromsizes: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/getchromsizes/meta.yml b/modules/nf-core/custom/getchromsizes/meta.yml new file mode 100644 index 0000000..219ca1d --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/meta.yml @@ -0,0 +1,53 @@ +name: custom_getchromsizes +description: Generates a FASTA file of chromosome sizes and a fasta index file +keywords: + - fasta + - chromosome + - indexing +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + tool_dev_url: https://github.com/samtools/samtools + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta,fna,fas}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sizes: + type: file + description: File containing chromosome lengths + pattern: "*.{sizes}" + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@tamara-hodgetts" + - "@chris-cheshire" + - "@muffato" diff --git a/modules/nf-core/gnu/sort/main.nf b/modules/nf-core/gnu/sort/main.nf new file mode 100644 index 0000000..b0a57fb --- /dev/null +++ b/modules/nf-core/gnu/sort/main.nf @@ -0,0 +1,52 @@ +process GNU_SORT { + tag "${meta.id}" + label "process_low" + + conda "bioconda::coreutils=8.25" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/coreutils:8.25--1' : + 'biocontainers/coreutils:8.25--1' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), file( "${output_file}" ) , emit: sorted + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.extension}" + output_file = "${prefix}.${suffix}" + def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + if ("$input" == "$output_file") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + sort ${args} ${input} > ${output_file} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coreutils: $VERSION + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.extension}" + output_file = "${prefix}.${suffix}" + def VERSION = "9.1" + + if ("$input" == "$output_file") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + sort ${args} ${input} > ${output_file} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coreutils: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/gnu/sort/meta.yml b/modules/nf-core/gnu/sort/meta.yml new file mode 100644 index 0000000..e7fb028 --- /dev/null +++ b/modules/nf-core/gnu/sort/meta.yml @@ -0,0 +1,42 @@ +name: "GNU_SORT" +description: | + Writes a sorted concatenation of file/s +keywords: + - GNU + - sort + - merge compare +tools: + - sort: + description: "Writes a sorted concatenation of file/s" + homepage: "https://github.com/vgl-hub/gfastats" + documentation: "https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html" + licence: ["GPL"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Draft assembly file + pattern: "*.{txt,bed,interval,genome,bins}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sorted: + type: file + description: The sorted txt file generated by sort + pattern: "*.{txt,bed,interval,genome,bins}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@DLBPointon" diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf new file mode 100644 index 0000000..59ed308 --- /dev/null +++ b/modules/nf-core/samtools/faidx/main.nf @@ -0,0 +1,50 @@ +process SAMTOOLS_FAIDX { + tag "$fasta" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(fai) + + output: + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + faidx \\ + $fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? "touch ${match[0][1]}" : '' + """ + ${fastacmd} + touch ${fasta}.fai + + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml new file mode 100644 index 0000000..957b25e --- /dev/null +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -0,0 +1,57 @@ +name: samtools_faidx +description: Index FASTA file +keywords: + - index + - fasta + - faidx +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" From 6eb79325254a19c046fda7d8664c0607100e04e0 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:17:35 +0100 Subject: [PATCH 03/17] Linting --- bin/gc_content.py | 4 +++ modules.json | 74 ++++++++++++----------------------------------- 2 files changed, 23 insertions(+), 55 deletions(-) diff --git a/bin/gc_content.py b/bin/gc_content.py index b2f786b..0289d3e 100644 --- a/bin/gc_content.py +++ b/bin/gc_content.py @@ -1,6 +1,10 @@ #!/usr/bin/env python3 """ Script for finding the GC content of each sequence in a multiFASTA file + +Written by Eerik Aunin @eeaunin + +Adapted by Damon-Lee Pointon @DLBPointon """ import argparse diff --git a/modules.json b/modules.json index dab76c6..1bcb845 100644 --- a/modules.json +++ b/modules.json @@ -8,132 +8,96 @@ "blast/blastn": { "branch": "master", "git_sha": "1728df69ccbaf4ccb1027c6fd4e9191f48c22194", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "blast/makeblastdb": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "custom/getchromsizes": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "diamond/blastx": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fcs/fcsadaptor": { "branch": "master", "git_sha": "5a35af8b60d45425c4b9193e567d16b614d93dbe", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gnu/sort": { "branch": "master", "git_sha": "88f6e982fb8bd40488d837b3b08a65008e602840", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "kraken2/kraken2": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff" }, "minimap2/align": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/depth": { "branch": "master", "git_sha": "a1ffbc1fd87bd5a829e956cc26ec9cc53af3e817", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/sort": { "branch": "master", "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "seqkit/sliding": { "branch": "master", "git_sha": "0aa251c4ce7318c86b9868d8cb8dd6dd5d7da849", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tiara/tiara": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } } } } -} \ No newline at end of file +} From 0dcf74abcd23564d8fc9cd8b13dc6df6a9a67392 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:24:15 +0100 Subject: [PATCH 04/17] Correcting naming --- workflows/ascc.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 6060d12..8571f26 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -69,7 +69,7 @@ workflow ASCC { ch_versions = ch_versions.mix(YAML_INPUT.out.versions) GC_CONTENT ( - YAML_INPUT.out.ref_tuple + YAML_INPUT.out.reference_tuple ) //Channel @@ -80,7 +80,7 @@ workflow ASCC { // SUBWORKFLOW: GENERATE GENOME FILE // GENERATE_GENOME ( - YAML_INPUT.out.ref_tuple + YAML_INPUT.out.reference_tuple ) ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) From ad8ae33acb7b3b32fe01f7a764947bf51277910e Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:27:50 +0100 Subject: [PATCH 05/17] Updating module for our needs --- modules.json | 3 +- .../getchromsizes/custom-getchromsizes.diff | 63 +++++++++++++++++++ modules/nf-core/custom/getchromsizes/main.nf | 31 +++++---- 3 files changed, 85 insertions(+), 12 deletions(-) create mode 100644 modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff diff --git a/modules.json b/modules.json index dab76c6..1ff4258 100644 --- a/modules.json +++ b/modules.json @@ -31,7 +31,8 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": [ "modules" - ] + ], + "patch": "modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff" }, "diamond/blastx": { "branch": "master", diff --git a/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff b/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff new file mode 100644 index 0000000..6d72652 --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff @@ -0,0 +1,63 @@ +Changes in module 'nf-core/custom/getchromsizes' +--- modules/nf-core/custom/getchromsizes/main.nf ++++ modules/nf-core/custom/getchromsizes/main.nf +@@ -1,5 +1,9 @@ ++// Forked from the nf-core module to: ++// 1. allow selecting a different extension for the `sizes` channel ++// 2. force all output files to be named according to the prefix ++// 3. rename the input fasta file too and output it so that it can be "published" + process CUSTOM_GETCHROMSIZES { +- tag "$fasta" ++ tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.16.1" +@@ -8,22 +12,26 @@ + 'biocontainers/samtools:1.16.1--h6899075_1' }" + + input: +- tuple val(meta), path(fasta) ++ tuple val(meta), path(fasta, stageAs: 'input/*') ++ val suffix + + output: +- tuple val(meta), path ("*.sizes"), emit: sizes +- tuple val(meta), path ("*.fai") , emit: fai +- tuple val(meta), path ("*.gzi") , emit: gzi, optional: true +- path "versions.yml" , emit: versions ++ tuple val(meta), path ("*.${suffix}") , emit: sizes ++ tuple val(meta), path ("*.fa") , emit: fasta ++ tuple val(meta), path ("*.fai") , emit: fai ++ tuple val(meta), path ("*.gzi") , emit: gzi, optional: true ++ path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: +- def args = task.ext.args ?: '' ++ def args = task.ext.args ?: '' ++ def prefix = task.ext.prefix ?: "${meta.id}" + """ +- samtools faidx $fasta +- cut -f 1,2 ${fasta}.fai > ${fasta}.sizes ++ ln -s ${fasta} ${prefix}.fa ++ samtools faidx ${prefix}.fa -o ${prefix}.fa.fai ++ cut -f 1,2 ${prefix}.fa.fai > ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": +@@ -33,8 +41,9 @@ + + stub: + """ +- touch ${fasta}.fai +- touch ${fasta}.sizes ++ ln -s ${fasta} ${prefix}.fa ++ touch ${prefix}.fa.fai ++ touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/custom/getchromsizes/main.nf b/modules/nf-core/custom/getchromsizes/main.nf index 060a2e8..b6387e0 100644 --- a/modules/nf-core/custom/getchromsizes/main.nf +++ b/modules/nf-core/custom/getchromsizes/main.nf @@ -1,5 +1,9 @@ +// Forked from the nf-core module to: +// 1. allow selecting a different extension for the `sizes` channel +// 2. force all output files to be named according to the prefix +// 3. rename the input fasta file too and output it so that it can be "published" process CUSTOM_GETCHROMSIZES { - tag "$fasta" + tag "$meta.id" label 'process_single' conda "bioconda::samtools=1.16.1" @@ -8,22 +12,26 @@ process CUSTOM_GETCHROMSIZES { 'biocontainers/samtools:1.16.1--h6899075_1' }" input: - tuple val(meta), path(fasta) + tuple val(meta), path(fasta, stageAs: 'input/*') + val suffix output: - tuple val(meta), path ("*.sizes"), emit: sizes - tuple val(meta), path ("*.fai") , emit: fai - tuple val(meta), path ("*.gzi") , emit: gzi, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path ("*.${suffix}") , emit: sizes + tuple val(meta), path ("*.fa") , emit: fasta + tuple val(meta), path ("*.fai") , emit: fai + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ - samtools faidx $fasta - cut -f 1,2 ${fasta}.fai > ${fasta}.sizes + ln -s ${fasta} ${prefix}.fa + samtools faidx ${prefix}.fa -o ${prefix}.fa.fai + cut -f 1,2 ${prefix}.fa.fai > ${prefix}.${suffix} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -33,8 +41,9 @@ process CUSTOM_GETCHROMSIZES { stub: """ - touch ${fasta}.fai - touch ${fasta}.sizes + ln -s ${fasta} ${prefix}.fa + touch ${prefix}.fa.fai + touch ${prefix}.${suffix} cat <<-END_VERSIONS > versions.yml "${task.process}": From ce46b9e8d5831801df2a1f416628f6be0b266cd1 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:30:58 +0100 Subject: [PATCH 06/17] Updating yaml_input --- subworkflows/local/yaml_input.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 4ffd8c7..602f79d 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -51,7 +51,7 @@ workflow YAML_INPUT { group.assembly_title .combine( group.assembly_path ) .map { id, file -> - tuple( [ id: meta.id ], + tuple( [ id: id ], file ) } @@ -60,7 +60,7 @@ workflow YAML_INPUT { group.assembly_title .combine( group.pacbio_reads ) .map { id, file -> - tuple( [ id: meta.id ], + tuple( [ id: id ], file ) } From a8d1e7120a4a2d5c1aa67a0dab513c7f240ba504 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:39:25 +0100 Subject: [PATCH 07/17] General fixes --- conf/test_full.config | 7 +++++++ workflows/ascc.nf | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/conf/test_full.config b/conf/test_full.config index 6857fb1..338f7c8 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -9,6 +9,13 @@ ---------------------------------------------------------------------------------------- */ +process { + maxForks = 1 +} + +executor { + queueSize=1 +} params { config_profile_name = 'Full test profile' diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 8571f26..ccfb130 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -95,7 +95,7 @@ workflow ASCC { // // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE // - /*GENERATE_GENOME.out.reference_tuple + /*YAML_INPUT.out.reference_tuple .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() ) .combine ( YAML_INPUT.out.seqkit_window.toInteger() ) .map { meta, ref, sliding, window -> @@ -122,7 +122,7 @@ workflow ASCC { // SUBWORKFLOW: // RUN_FCSADAPTOR ( - GENERATE_GENOME.out.reference_tuple + YAML_INPUT.out.reference_tuple ) ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) From b02bad6c32f96583e0c3c7fd4e35013e92a669ad Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:53:57 +0100 Subject: [PATCH 08/17] General fixes --- conf/modules.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index 5413100..b950c9b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,4 +52,9 @@ process { ext.prefix = { "${meta.id}_euk" } } + withName: '.*:.*:GENERATE_GENOME:GNU_SORT' { + ext.prefix = { "${meta.id}_sorted"} + ext.args = { '-k2,2 -nr' } + } + } From 689512a624ffba7d157ff5176eead2335000cd54 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 12:57:28 +0100 Subject: [PATCH 09/17] PERMISSIONS --- bin/BedTools.py | 0 bin/extract_contaminants_by_type.py | 0 bin/filter_barcode_blast_results.py | 0 bin/gc_content.py | 0 bin/organelle_contamination_recommendation.py | 0 bin/pacbio_barcode_check.py | 0 bin/reformat_diamond_outfmt6.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/BedTools.py mode change 100644 => 100755 bin/extract_contaminants_by_type.py mode change 100644 => 100755 bin/filter_barcode_blast_results.py mode change 100644 => 100755 bin/gc_content.py mode change 100644 => 100755 bin/organelle_contamination_recommendation.py mode change 100644 => 100755 bin/pacbio_barcode_check.py mode change 100644 => 100755 bin/reformat_diamond_outfmt6.py diff --git a/bin/BedTools.py b/bin/BedTools.py old mode 100644 new mode 100755 diff --git a/bin/extract_contaminants_by_type.py b/bin/extract_contaminants_by_type.py old mode 100644 new mode 100755 diff --git a/bin/filter_barcode_blast_results.py b/bin/filter_barcode_blast_results.py old mode 100644 new mode 100755 diff --git a/bin/gc_content.py b/bin/gc_content.py old mode 100644 new mode 100755 diff --git a/bin/organelle_contamination_recommendation.py b/bin/organelle_contamination_recommendation.py old mode 100644 new mode 100755 diff --git a/bin/pacbio_barcode_check.py b/bin/pacbio_barcode_check.py old mode 100644 new mode 100755 diff --git a/bin/reformat_diamond_outfmt6.py b/bin/reformat_diamond_outfmt6.py old mode 100644 new mode 100755 From 586d26d1a99455bec0ec471b1adadaa9efc7b4a1 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 5 Oct 2023 16:15:43 +0100 Subject: [PATCH 10/17] Updates to versions --- subworkflows/local/generate_genome.nf | 1 + workflows/ascc.nf | 1 + 2 files changed, 2 insertions(+) diff --git a/subworkflows/local/generate_genome.nf b/subworkflows/local/generate_genome.nf index 61cd434..4480d0b 100755 --- a/subworkflows/local/generate_genome.nf +++ b/subworkflows/local/generate_genome.nf @@ -31,6 +31,7 @@ workflow GENERATE_GENOME { GNU_SORT ( CUSTOM_GETCHROMSIZES.out.sizes ) + ch_versions = ch_versions.mix( GNU_SORT.out.versions ) // // MODULE: Cut out the largest scaffold size and use as comparator against 512MB diff --git a/workflows/ascc.nf b/workflows/ascc.nf index ccfb130..f6fc79c 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -71,6 +71,7 @@ workflow ASCC { GC_CONTENT ( YAML_INPUT.out.reference_tuple ) + ch_versions = ch_versions.mix(GC_CONTENT.out.versions) //Channel // .fromPath( YAML_INPUT.out.nt_database, checkIfExists=true ) From 1d2051ed2ac79b059290fd5235e79c68d1619524 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 9 Oct 2023 10:37:12 +0100 Subject: [PATCH 11/17] Correcting version outputs and containers of use --- conf/test_full.config | 7 +++++ modules/local/blast_chunk_to_full.nf | 7 +++-- modules/local/blast_get_top_hits.nf | 8 +++-- modules/local/check_barcode.nf | 8 +++-- modules/local/extract_contaminants.nf | 8 +++-- modules/local/filter_barcode.nf | 8 +++-- modules/local/format_diamond_outfmt6.nf | 8 +++-- modules/local/gc_content.nf | 8 +++-- modules/local/get_lineage_for_kraken.nf | 1 + modules/local/get_lineage_for_top.nf | 8 +++-- ...organelle_contamination_recommendations.nf | 8 +++-- modules/local/reformat_full_outfmt6.nf | 8 +++-- modules/nf-core/custom/getchromsizes/main.nf | 31 ++++++++++++------- 13 files changed, 77 insertions(+), 41 deletions(-) diff --git a/conf/test_full.config b/conf/test_full.config index 6857fb1..338f7c8 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -9,6 +9,13 @@ ---------------------------------------------------------------------------------------- */ +process { + maxForks = 1 +} + +executor { + queueSize=1 +} params { config_profile_name = 'Full test profile' diff --git a/modules/local/blast_chunk_to_full.nf b/modules/local/blast_chunk_to_full.nf index 6c4c228..6fdb04f 100644 --- a/modules/local/blast_chunk_to_full.nf +++ b/modules/local/blast_chunk_to_full.nf @@ -2,10 +2,10 @@ process BLAST_CHUNK_TO_FULL { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(chunked) @@ -22,6 +22,7 @@ process BLAST_CHUNK_TO_FULL { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') blast_hit_chunk_coords_to_full_coords: \$(blast_hit_chunk_coords_to_full_coords.py -v) END_VERSIONS """ diff --git a/modules/local/blast_get_top_hits.nf b/modules/local/blast_get_top_hits.nf index 5faaa9c..6590576 100644 --- a/modules/local/blast_get_top_hits.nf +++ b/modules/local/blast_get_top_hits.nf @@ -2,10 +2,10 @@ process BLAST_GET_TOP_HITS { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(outfmt6) @@ -21,6 +21,7 @@ process BLAST_GET_TOP_HITS { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') blast_get_top_hits: \$(blast_get_top_hits.py -v) END_VERSIONS """ @@ -31,6 +32,7 @@ process BLAST_GET_TOP_HITS { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') reformat_blast_outfmt6: \$(blast_get_top_hits.py -v) END_VERSIONS """ diff --git a/modules/local/check_barcode.nf b/modules/local/check_barcode.nf index b02ce59..c3ccfc4 100644 --- a/modules/local/check_barcode.nf +++ b/modules/local/check_barcode.nf @@ -2,10 +2,10 @@ process CHECK_BARCODE { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta) , path(barcodes) @@ -27,6 +27,7 @@ process CHECK_BARCODE { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') pacbio_barcode_check: \$(pacbio_barcode_check.py -v) END_VERSIONS """ @@ -37,6 +38,7 @@ process CHECK_BARCODE { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') pacbio_barcode_check: \$(pacbio_barcode_check.py -v) END_VERSIONS """ diff --git a/modules/local/extract_contaminants.nf b/modules/local/extract_contaminants.nf index b2a7c8b..63116cd 100644 --- a/modules/local/extract_contaminants.nf +++ b/modules/local/extract_contaminants.nf @@ -2,10 +2,10 @@ process EXTRACT_CONTAMINANTS { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(fasta) @@ -24,6 +24,7 @@ process EXTRACT_CONTAMINANTS { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') extract_contaminants_by_type: \$(extract_contaminants_by_type.py -v) END_VERSIONS """ @@ -35,6 +36,7 @@ process EXTRACT_CONTAMINANTS { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') extract_contaminants_by_type: \$(extract_contaminants_by_type.py -v) END_VERSIONS """ diff --git a/modules/local/filter_barcode.nf b/modules/local/filter_barcode.nf index 0b2c90d..15f9c16 100644 --- a/modules/local/filter_barcode.nf +++ b/modules/local/filter_barcode.nf @@ -2,10 +2,10 @@ process FILTER_BARCODE { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(fasta) @@ -28,6 +28,7 @@ process FILTER_BARCODE { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') filter_barcode_blast_results: \$(filter_barcode_blast_results.py -v) END_VERSIONS """ @@ -41,6 +42,7 @@ process FILTER_BARCODE { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') filter_barcode_blast_results: \$(filter_barcode_blast_results.py -v) END_VERSIONS """ diff --git a/modules/local/format_diamond_outfmt6.nf b/modules/local/format_diamond_outfmt6.nf index f9fdcf3..e2acf1c 100644 --- a/modules/local/format_diamond_outfmt6.nf +++ b/modules/local/format_diamond_outfmt6.nf @@ -2,10 +2,10 @@ process REFORMAT_FULL_OUTFMT6 { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(diamond_blast) @@ -21,6 +21,7 @@ process REFORMAT_FULL_OUTFMT6 { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') reformat_diamond_outfmt6: \$(reformat_diamond_outfmt6.py -v) END_VERSIONS """ @@ -32,6 +33,7 @@ process REFORMAT_FULL_OUTFMT6 { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') reformat_diamond_outfmt6: \$(reformat_diamond_outfmt6.py -v) END_VERSIONS """ diff --git a/modules/local/gc_content.nf b/modules/local/gc_content.nf index 0a650bf..76520f1 100644 --- a/modules/local/gc_content.nf +++ b/modules/local/gc_content.nf @@ -2,10 +2,10 @@ process GC_CONTENT { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(fasta) @@ -21,6 +21,7 @@ process GC_CONTENT { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') gc_content: \$(gc_content.py -v) END_VERSIONS """ @@ -31,6 +32,7 @@ process GC_CONTENT { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') gc_content: \$(gc_content.py -v) END_VERSIONS """ diff --git a/modules/local/get_lineage_for_kraken.nf b/modules/local/get_lineage_for_kraken.nf index 59523c0..6a55a5e 100755 --- a/modules/local/get_lineage_for_kraken.nf +++ b/modules/local/get_lineage_for_kraken.nf @@ -44,6 +44,7 @@ process GET_LINEAGE_FOR_KRAKEN { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') + pandas: \$(pip list | grep "pandas" | sed 's/[[:blank:]]//g' | sed 's/pandas//g') general_purpose_functions.py: \$(general_purpose_functions.py --version | cut -d' ' -f2) get_lineage_for_kraken_results.py: \$(get_lineage_for_kraken_results.py --version | cut -d' ' -f2) END_VERSIONS diff --git a/modules/local/get_lineage_for_top.nf b/modules/local/get_lineage_for_top.nf index 7f29868..489ba06 100644 --- a/modules/local/get_lineage_for_top.nf +++ b/modules/local/get_lineage_for_top.nf @@ -2,10 +2,10 @@ process GET_LINEAGE_FOR_TOP { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(tophits) @@ -22,6 +22,7 @@ process GET_LINEAGE_FOR_TOP { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') get_lineage_for_top: \$(get_lineage_for_top.py -v) END_VERSIONS """ @@ -32,6 +33,7 @@ process GET_LINEAGE_FOR_TOP { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') get_lineage_for_top: \$(get_lineage_for_top.py -v) END_VERSIONS """ diff --git a/modules/local/organelle_contamination_recommendations.nf b/modules/local/organelle_contamination_recommendations.nf index dda5487..f89b1dc 100644 --- a/modules/local/organelle_contamination_recommendations.nf +++ b/modules/local/organelle_contamination_recommendations.nf @@ -2,10 +2,10 @@ process ORGANELLE_CONTAMINATION_RECOMMENDATIONS { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(fasta) @@ -24,6 +24,7 @@ process ORGANELLE_CONTAMINATION_RECOMMENDATIONS { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') organelle_contamination_recommendation: \$(organelle_contamination_recommendation.py -v) END_VERSIONS """ @@ -36,6 +37,7 @@ process ORGANELLE_CONTAMINATION_RECOMMENDATIONS { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') organelle_contamination_recommendation: \$(organelle_contamination_recommendation.py -v) END_VERSIONS """ diff --git a/modules/local/reformat_full_outfmt6.nf b/modules/local/reformat_full_outfmt6.nf index 7fb83e2..fad2989 100644 --- a/modules/local/reformat_full_outfmt6.nf +++ b/modules/local/reformat_full_outfmt6.nf @@ -2,10 +2,10 @@ process REFORMAT_FULL_OUTFMT6 { tag "${meta.id}" label 'process_low' - conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(full_blast) @@ -21,6 +21,7 @@ process REFORMAT_FULL_OUTFMT6 { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') reformat_blast_outfmt6: \$(reformat_blast_outfmt6.py -v) END_VERSIONS """ @@ -31,6 +32,7 @@ process REFORMAT_FULL_OUTFMT6 { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') reformat_blast_outfmt6: \$(reformat_blast_outfmt6.py -v) END_VERSIONS """ diff --git a/modules/nf-core/custom/getchromsizes/main.nf b/modules/nf-core/custom/getchromsizes/main.nf index 060a2e8..b6387e0 100644 --- a/modules/nf-core/custom/getchromsizes/main.nf +++ b/modules/nf-core/custom/getchromsizes/main.nf @@ -1,5 +1,9 @@ +// Forked from the nf-core module to: +// 1. allow selecting a different extension for the `sizes` channel +// 2. force all output files to be named according to the prefix +// 3. rename the input fasta file too and output it so that it can be "published" process CUSTOM_GETCHROMSIZES { - tag "$fasta" + tag "$meta.id" label 'process_single' conda "bioconda::samtools=1.16.1" @@ -8,22 +12,26 @@ process CUSTOM_GETCHROMSIZES { 'biocontainers/samtools:1.16.1--h6899075_1' }" input: - tuple val(meta), path(fasta) + tuple val(meta), path(fasta, stageAs: 'input/*') + val suffix output: - tuple val(meta), path ("*.sizes"), emit: sizes - tuple val(meta), path ("*.fai") , emit: fai - tuple val(meta), path ("*.gzi") , emit: gzi, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path ("*.${suffix}") , emit: sizes + tuple val(meta), path ("*.fa") , emit: fasta + tuple val(meta), path ("*.fai") , emit: fai + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ - samtools faidx $fasta - cut -f 1,2 ${fasta}.fai > ${fasta}.sizes + ln -s ${fasta} ${prefix}.fa + samtools faidx ${prefix}.fa -o ${prefix}.fa.fai + cut -f 1,2 ${prefix}.fa.fai > ${prefix}.${suffix} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -33,8 +41,9 @@ process CUSTOM_GETCHROMSIZES { stub: """ - touch ${fasta}.fai - touch ${fasta}.sizes + ln -s ${fasta} ${prefix}.fa + touch ${prefix}.fa.fai + touch ${prefix}.${suffix} cat <<-END_VERSIONS > versions.yml "${task.process}": From cf44a53a758f8e9830c29437ca42759b72300beb Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 9 Oct 2023 10:55:26 +0100 Subject: [PATCH 12/17] Reformatting python script --- bin/reformat_blast_outfmt6.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/bin/reformat_blast_outfmt6.py b/bin/reformat_blast_outfmt6.py index 7e94ed2..1270fe8 100755 --- a/bin/reformat_blast_outfmt6.py +++ b/bin/reformat_blast_outfmt6.py @@ -23,20 +23,6 @@ in_data = gpf.ll(in_path) for line in in_data: - split_line = line.split() - assert len(split_line) == 14 - output_line = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( - split_line[0], - split_line[4], - split_line[5], - split_line[6], - split_line[7], - split_line[8], - split_line[9], - split_line[10], - split_line[11], - split_line[12], - split_line[13], - split_line[2], - ) - print(output_line) + s = line.split() + assert len(s) == 14 + print(f"{s[0]}\t{s[4]}\t{s[5]}\t{s[6]}\t{s[7]}\t{s[8]}\t{s[9]}\t{s[10]}\t{s[11]}\t{s[12]}\t{s[13]}\t{s[2]}") From 9fa6e177345755869987338e06d3b65506a2eb5d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 10 Oct 2023 14:27:10 +0100 Subject: [PATCH 13/17] Adding easier to read line print --- bin/reformat_blast_outfmt6.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/reformat_blast_outfmt6.py b/bin/reformat_blast_outfmt6.py index 1270fe8..a867ee5 100755 --- a/bin/reformat_blast_outfmt6.py +++ b/bin/reformat_blast_outfmt6.py @@ -25,4 +25,4 @@ for line in in_data: s = line.split() assert len(s) == 14 - print(f"{s[0]}\t{s[4]}\t{s[5]}\t{s[6]}\t{s[7]}\t{s[8]}\t{s[9]}\t{s[10]}\t{s[11]}\t{s[12]}\t{s[13]}\t{s[2]}") + print('\t'.join( s[0:1] + s[4:] + s[2:3] )) From abc081466bada7a0af592db86af1f2898969b8cb Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 10 Oct 2023 14:28:24 +0100 Subject: [PATCH 14/17] Black Linting --- bin/reformat_blast_outfmt6.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/reformat_blast_outfmt6.py b/bin/reformat_blast_outfmt6.py index a867ee5..e32755e 100755 --- a/bin/reformat_blast_outfmt6.py +++ b/bin/reformat_blast_outfmt6.py @@ -25,4 +25,4 @@ for line in in_data: s = line.split() assert len(s) == 14 - print('\t'.join( s[0:1] + s[4:] + s[2:3] )) + print("\t".join(s[0:1] + s[4:] + s[2:3])) From 65fdac303ac54c97e2ab5ac15dc6e2f7fd9e4aeb Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 10 Oct 2023 14:33:37 +0100 Subject: [PATCH 15/17] Fix overcorrection --- modules.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 09f3b6a..759164e 100644 --- a/modules.json +++ b/modules.json @@ -44,7 +44,8 @@ "gnu/sort": { "branch": "master", "git_sha": "88f6e982fb8bd40488d837b3b08a65008e602840", - + "installed_by": ["modules"] + }, "fcs/fcsgx": { "branch": "master", "git_sha": "8c4542e5d421c4690cf1fa6ec729e9304763fdaf", From 332f02ec898f46ca52e09425c4d77e0823cfc4b5 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 10 Oct 2023 14:36:23 +0100 Subject: [PATCH 16/17] Correction to channel from dev --- workflows/ascc.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index f128f60..9227657 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -127,17 +127,17 @@ workflow ASCC { YAML_INPUT.out.reference_tuple ) ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) - + // // SUBWORKFLOW: // RUN_FCSGX ( - YAML_INPUT.out.reference, - YAML_INPUT.out.fcs_gx_database_path, - YAML_INPUT.out.taxid, - YAML_INPUT.out.ncbi_rankedlineage_path + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.fcs_gx_database_path, + YAML_INPUT.out.taxid, + YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_versions = ch_versions.mix(RUN_FCSGX.out.versions) + ch_versions = ch_versions.mix(RUN_FCSGX.out.versions) // // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS From 5e01a76d4857aff7f51eb8302eae10760848cf26 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 10 Oct 2023 14:48:36 +0100 Subject: [PATCH 17/17] Correction to channel from dev --- subworkflows/local/run_fcsgx.nf | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/run_fcsgx.nf b/subworkflows/local/run_fcsgx.nf index c592578..a748f67 100644 --- a/subworkflows/local/run_fcsgx.nf +++ b/subworkflows/local/run_fcsgx.nf @@ -13,23 +13,22 @@ workflow RUN_FCSGX { ch_versions = Channel.empty() Channel - .of('all.gxi', 'all.gxs', 'all.taxa.tsv', 'all.meta.jsonl', 'all.blast_div.tsv.gz') - .combine(fcsgxpath) - .map {suxfix, dbpath -> [file(dbpath + '/' + suxfix)]} - .collect() - .set {fcsgxdb} + .of('all.gxi', 'all.gxs', 'all.taxa.tsv', 'all.meta.jsonl', 'all.blast_div.tsv.gz') + .combine(fcsgxpath) + .map {suxfix, dbpath -> [file(dbpath + '/' + suxfix)]} + .collect() + .set {fcsgxdb} // // Create input channel for FCS_FCSGX, taxid is required to be the meta id. // reference - .combine( taxid ) - .map { it -> - tuple ([taxid: it[1]], - it[0]) - } - .set { reference_with_taxid } - + .combine( taxid ) + .map { it -> + tuple ([taxid: it[2]], + it[1]) + } + .set { reference_with_taxid } // // MODULE: FCS_FCSGX run on assembly fasta tuple with taxid againist fcsgxdb. @@ -45,12 +44,12 @@ workflow RUN_FCSGX { // FCS_FCSGX.out.fcs_gx_report .map{ it -> - tuple(it[0], - it[1].getParent() + tuple( it[0], + it[1].getParent() ) } .set { report_path } - + // // MODULE: PARSE_FCSGX_RESULT to parse the FCS_FCSGX result output in csv format. // @@ -64,4 +63,4 @@ workflow RUN_FCSGX { fcsgxresult = PARSE_FCSGX_RESULT.out.fcsgxresult versions = ch_versions.ifEmpty(null) -} \ No newline at end of file +}