From 0788bee679924783cc9afa25bb3bd5c592a2fb3f Mon Sep 17 00:00:00 2001
From: William Eagles
Date: Thu, 13 Jul 2023 14:22:58 +0100
Subject: [PATCH 1/6] Add initial kraken subworkflow

---
 bin/general_purpose_functions.py         | 228 +++++++++++++++++++++++
 bin/get_lineage_for_kraken_results.py    |  94 ++++++++++
 conf/modules.config                      |   4 +
 modules.json                             |  21 ++-
 modules/local/get_lineage_for_kraken.nf  |  51 +++++
 modules/nf-core/kraken2/kraken2/main.nf  |  58 ++++++
 modules/nf-core/kraken2/kraken2/meta.yml |  75 ++++++++
 subworkflows/local/run_nt_kraken.nf      |  43 +++++
 8 files changed, 570 insertions(+), 4 deletions(-)
 create mode 100755 bin/general_purpose_functions.py
 create mode 100755 bin/get_lineage_for_kraken_results.py
 create mode 100755 modules/local/get_lineage_for_kraken.nf
 create mode 100644 modules/nf-core/kraken2/kraken2/main.nf
 create mode 100644 modules/nf-core/kraken2/kraken2/meta.yml
 create mode 100755 subworkflows/local/run_nt_kraken.nf

diff --git a/bin/general_purpose_functions.py b/bin/general_purpose_functions.py
new file mode 100755
index 0000000..c2c216d
--- /dev/null
+++ b/bin/general_purpose_functions.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""
+General purpose functions
+File for functions that can be reused in many Python scripts
+"""
+# MIT License
+#
+# Copyright (c) 2020-2021 Genome Research Ltd.
+#
+# Author: Eerik Aunin (ea10@sanger.ac.uk)
+#
+# This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
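+#
+# Usage sketch: these helpers are imported by the other scripts in bin/,
+# which sit next to this file at runtime, e.g.
+#
+#   import general_purpose_functions as gpf
+#   for header, seq in gpf.read_fasta_in_chunks("assembly.fasta"):  # placeholder path
+#       print(header, len(seq))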
+ +import os +from os.path import isfile +import sys +import subprocess +import signal +from datetime import datetime +import argparse + + +def l(path): + """ + Function for loading text file as a list and removing line breaks from line ends + """ + lines = [] + if isfile(path): + with open(path, "r") as in_file: + lines = in_file.readlines() + lines = [x.rstrip() for x in lines] + else: + sys.stderr.write("Error: file not found (" + path + ")\n") + sys.exit(1) + return lines + + +def ll(in_path): + """ + Function for reading a text file line by line + """ + if isfile(in_path): + with open(in_path, "r") as in_file: + for line in in_file: + line = line.rstrip() + yield line + else: + sys.stderr.write("Error: file not found (" + in_path + ")\n") + sys.exit(1) + + +def export_list_as_line_break_separated_file(out_list_f, out_path): + """ + Exports a list to a file, each item on a separate row + """ + with open(out_path, "w") as out_file: + for item in out_list_f: + out_file.write(str(item)) + out_file.write("\n") + + +def spl(line, left_splitter, right_splitter, direction=0): + """ + Function for cropping a string from left and right side + Direction: if 0: the string will be cropped first from left and then right + if 1: the string will be cropped first from right and then left + Returns None if the splitting cannot be done because a splitter or both splitters are not in the input string + """ + out_line = None + if left_splitter in line and right_splitter in line: + if direction == 0: + out_line = line.split(left_splitter)[1] + out_line = out_line.split(right_splitter)[0] + elif direction == 1: + out_line = line.split(right_splitter)[0] + out_line = out_line.split(left_splitter)[1] + return out_line + + +def print_with_fixed_row_length(seq, max_length): + """ + Input: 1) a string 2) maximum line length in output + Output: the input string printed to STDOUT in chunks with fixed maximum line length + """ + split_seq = [seq[i:i+max_length] for i in range(0, len(seq), max_length)] + for line in split_seq: + print(line) + + +def split_with_fixed_row_length(seq, max_length): + """ + Input: 1) a string 2) maximum line length in output + Output: the input string split in chunks with fixed maximum line length + """ + split_seq = [seq[i:i + max_length] for i in range(0, len(seq), max_length)] + return split_seq + + +def reverse_complement(seq): + """ + Returns the reverse complement of a DNA sequence + """ + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'n': 'n'} + reverse_comp = "".join(complement.get(base, base) for base in reversed(seq)) + return reverse_comp + + +def read_fasta_in_chunks(in_path): + """ + Input: path to FASTA file + Output (iterator): string tuples where the first element is a FASTA header and the second element is the corresponding FASTA sequence + """ + in_data = ll(in_path) + current_seq_header = None + seq = "" + for line in in_data: + if line != "": + if line[0] == ">": + if seq != "": + yield (current_seq_header, seq) + seq = "" + current_seq_header = line[1:len(line)] + else: + seq += line + if seq != "": + yield (current_seq_header, seq) + + +def list_to_chunks(lst, n): + """ + Yield successive n-sized chunks from lst + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks + """ + for i in range(0, len(lst), n): + yield lst[i: i + n] + + +def string_to_chunks(line, n): + """ + Function for splitting a string every nth character + 
https://stackoverflow.com/questions/9475241/split-string-every-nth-character + """ + return [line[i: i + n] for i in range(0, len(line), n)] + + +def run_system_command(system_command, verbose=True, dry_run=False, tries=1, expected_exit_code=0): + """ + Executes a system command and checks its exit code + """ + triggering_script_name = sys.argv[0].split("/")[-1] + try_counter_string = "" + if dry_run == False: + for i in range(0, tries): + if verbose == True: + time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if i > 0: + try_counter_string = ", try {}".format(i + 1) + out_message = "<{}, {}{}> executing command: {}\n".format(time_now, triggering_script_name, try_counter_string, system_command) + sys.stderr.write(out_message) + try: + output = subprocess.check_output(system_command, stderr=subprocess.STDOUT, shell=True, timeout=None, universal_newlines=True) + break + except subprocess.CalledProcessError as exc: + out_errormessage = "<" + triggering_script_name + "> " + " exited with error code " + str(exc.returncode) + if exc.output.isspace() == False: + out_errormessage += ". Error message: " + exc.output + if i == tries - 1: + if exc.returncode != expected_exit_code: + sys.stderr.write(out_errormessage + "\n") + os.kill(os.getpid(), signal.SIGINT) + + + +def check_if_file_exists(in_path): + """ + Function for checking if a file exists + """ + if os.path.isfile(in_path) == False: + sys.stderr.write("The input file " + in_path + " was not found\n") + sys.exit(1) + + +def get_file_paths(in_folder_path, extension): + """ + Function for getting the paths to all files with a specific extension in a user-specified folder + in_folder_path: path to the folder with input files + extension: file extension of input files + Output: paths to individual files with the specific extension (list) + """ + onlyfiles = list() + selected_file_paths = list() + if os.path.isdir(in_folder_path): + onlyfiles = [f for f in os.listdir(in_folder_path) if os.path.isfile(os.path.join(in_folder_path, f))] + for file_item in onlyfiles: + if "." + extension in file_item: + file_item_split = file_item.split(".") + if file_item_split[len(file_item_split) - 1] == extension: + selected_file_paths.append(in_folder_path + "/" + file_item) + else: + sys.stderr.write("Error: folder not found (" + in_folder_path + ")\n") + sys.exit(1) + return selected_file_paths + +def main(): + # Placeholder for accessing version. 
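+    # The script is used as a library; the argparse entry point below exists
+    # only so that callers (e.g. the versions.yml block in
+    # modules/local/get_lineage_for_kraken.nf) can query "--version".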
+
+    pass
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("-v", "--version", action="version", version="1.0")
+    parser.parse_args()
diff --git a/bin/get_lineage_for_kraken_results.py b/bin/get_lineage_for_kraken_results.py
new file mode 100755
index 0000000..a039990
--- /dev/null
+++ b/bin/get_lineage_for_kraken_results.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Script for getting lineage for Kraken results
+Developed by Eerik Aunin (ea10@sanger.ac.uk)
+"""
+
+import general_purpose_functions as gpf
+import sys
+from collections import OrderedDict
+import pandas as pd
+import os
+import signal
+import argparse
+
+
+def load_kraken_results(kraken_results_path):
+    """
+    Reads sequence names and taxid values from Kraken results file into a dictionary
+    """
+    kraken_dict = OrderedDict()
+    kraken_data = gpf.ll(kraken_results_path)
+    for line in kraken_data:
+        if "(taxid " in line:
+            split_line = line.split()
+            if len(split_line) >= 5:
+                seq_name = split_line[1]
+                taxid = gpf.spl(line, "(taxid ", ")")
+                if seq_name in kraken_dict:
+                    sys.stderr.write("Duplicate read names found in input ({})\n".format(seq_name))
+                    os.kill(os.getpid(), signal.SIGINT)
+                else:
+                    kraken_dict[seq_name] = taxid
+            else:
+                sys.stderr.write("Failed to parse Kraken output file line:\n{}\n".format(line))
+        else:
+            sys.stderr.write("No taxid found in input file line:\n{}\n".format(line))
+    return kraken_dict
+
+
+def load_lineage(lineage_dump_path, kraken_db_name):
+    """
+    Reads lineage information from NCBI rankedlineage.dmp file (downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz) as a dictionary of dictionaries.
+    Keys: taxid values. Values: a dictionary where the keys are taxonomic unit names
+    (e.g. "species", "genus" or "family") and the values are the corresponding taxonomic names
+    """
+    lineage_data = gpf.ll(lineage_dump_path)
+    lineage_dict = OrderedDict()
+    for line in lineage_data:
+        split_line = line.split("|")
+        split_line = [n.strip() for n in split_line]
+        entry_dict = {kraken_db_name + "_kraken_name": split_line[1], kraken_db_name + "_kraken_species": split_line[2], kraken_db_name + "_kraken_genus": split_line[3], kraken_db_name + "_kraken_family": split_line[4], kraken_db_name + "_kraken_order": split_line[5], kraken_db_name + "_kraken_class": split_line[6], kraken_db_name + "_kraken_phylum": split_line[7], kraken_db_name + "_kraken_kingdom": split_line[8], kraken_db_name + "_kraken_domain": split_line[9]}
+        lineage_dict[split_line[0]] = entry_dict
+    return lineage_dict
+
+
+def get_kraken_and_lineage_dict(kraken_dict, lineage_dict, kraken_db_name):
+    """
+    Merges the kraken results with lineage information for the taxid numbers
+    """
+    kraken_and_lineage_dict = OrderedDict()
+    for seq_name in kraken_dict:
+        taxid = kraken_dict[seq_name]
+        lineage_entry = {kraken_db_name + "_kraken_taxid": "0", kraken_db_name + "_kraken_name": None, kraken_db_name + "_kraken_species": None, kraken_db_name + "_kraken_genus": None, kraken_db_name + "_kraken_family": None, kraken_db_name + "_kraken_order": None, kraken_db_name + "_kraken_class": None, kraken_db_name + "_kraken_phylum": None, kraken_db_name + "_kraken_kingdom": None, kraken_db_name + "_kraken_domain": None}
+        if taxid in lineage_dict:
+            lineage_entry = lineage_dict[taxid]
+            lineage_entry[kraken_db_name + "_kraken_taxid"] = taxid
+        else:
+            if taxid != "0":
+                sys.stderr.write("Taxid {} was not found in the lineages dump file\n".format(taxid))
+        kraken_and_lineage_dict[seq_name] = 
lineage_entry + + return kraken_and_lineage_dict + + +def main(kraken_results_path, lineage_dump_path, kraken_db_name, out_path): + + kraken_dict = load_kraken_results(kraken_results_path) + lineage_dict = load_lineage(lineage_dump_path, kraken_db_name) + kraken_and_lineage_dict = get_kraken_and_lineage_dict(kraken_dict, lineage_dict, kraken_db_name) + df = pd.DataFrame.from_dict(kraken_and_lineage_dict) + df = df.transpose() + df.index = df.index.rename("scaff") + df.to_csv(out_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-v", "--version", action="version", version="1.0") + parser.add_argument("kraken_results_path", help="Path to output file of a Kraken run", type=str) + parser.add_argument("lineage_dump_path", help="Path to an NCBI taxonomy rankedlineage.dmp file (downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz)", type=str) + parser.add_argument("kraken_db_name", help="Kraken database name", type=str, choices=["bacterial", "nt"]) + parser.add_argument("out_path", help="Path for output CSV file", type=str) + args = parser.parse_args() + main(args.kraken_results_path, args.lineage_dump_path, args.kraken_db_name, args.out_path) diff --git a/conf/modules.config b/conf/modules.config index da58a5d..2460dc0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -38,4 +38,8 @@ process { ] } + withName: KRAKEN2_KRAKEN2 { + ext.args = { "--report-zero-counts --use-names" } + } + } diff --git a/modules.json b/modules.json index 6299d40..2821ead 100644 --- a/modules.json +++ b/modules.json @@ -8,20 +8,33 @@ "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "kraken2/kraken2": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/get_lineage_for_kraken.nf b/modules/local/get_lineage_for_kraken.nf new file mode 100755 index 0000000..a5f6dd6 --- /dev/null +++ b/modules/local/get_lineage_for_kraken.nf @@ -0,0 +1,51 @@ +process GET_LINEAGE_FOR_KRAKEN { + + tag "$meta.id" + label 'process_low' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/python:3.9' :
+        'biocontainers/python:3.9' }"

+    input:
+    tuple val(meta), path(kraken_file)
+    path ncbi_rankedlineage_path
+
+    output:
+    path '*_nt_kraken_lineage_file.txt', emit: txt
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    get_lineage_for_kraken_results.py \\
+        $kraken_file \\
+        $ncbi_rankedlineage_path \\
+        nt \\
+        ${prefix}_nt_kraken_lineage_file.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        general_purpose_functions.py: \$(general_purpose_functions.py --version | cut -d' ' -f2)
+        get_lineage_for_kraken_results.py: \$(get_lineage_for_kraken_results.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}_nt_kraken_lineage_file.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        general_purpose_functions.py: \$(general_purpose_functions.py --version | cut -d' ' -f2)
+        get_lineage_for_kraken_results.py: \$(get_lineage_for_kraken_results.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf
new file mode 100644
index 0000000..da8d8c6
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/main.nf
@@ -0,0 +1,58 @@
+process KRAKEN2_KRAKEN2 {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::kraken2=2.1.2 conda-forge::pigz=2.6"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
+        'biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path db
+    val save_output_fastqs
+    val save_reads_assignment
+
+    output:
+    tuple val(meta), path('*.classified{.,_}*')     , optional:true, emit: classified_reads_fastq
+    tuple val(meta), path('*.unclassified{.,_}*')   , optional:true, emit: unclassified_reads_fastq
+    tuple val(meta), path('*classifiedreads.txt')   , optional:true, emit: classified_reads_assignment
+    tuple val(meta), path('*report.txt')            , emit: report
+    path "versions.yml"                             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def paired       = meta.single_end ? "" : "--paired"
+    def classified   = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"
+    def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
+    def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ""
+    def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ""
+    def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null"
+    def compress_reads_command = save_output_fastqs ? 
"pigz -p $task.cpus *.fastq" : "" + + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + $paired \\ + $args \\ + $reads + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..4721f45 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -0,0 +1,75 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database + - save_output_fastqs: + type: string + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: string + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/subworkflows/local/run_nt_kraken.nf b/subworkflows/local/run_nt_kraken.nf new file mode 100755 index 0000000..0e302d8 --- /dev/null +++ b/subworkflows/local/run_nt_kraken.nf @@ -0,0 +1,43 @@ +#!/usr/bin/env nextflow + +// +// MODULE IMPORT BLOCK +// +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main' +include { GET_LINEAGE_FOR_KRAKEN } from '../../modules/local/get_lineage_for_kraken' + +workflow RUN_NT_KRAKEN { + take: + assembly_fasta + nt_kraken_db_path + ncbi_rankedlineage_path + + main: + ch_versions = Channel.empty() + + // + // MODULE: Kraken2 run on assembly fasta. 
+ // + KRAKEN2_KRAKEN2 ( + assembly_fasta, // val(meta), path(reads) + nt_kraken_db_path, // path db + false, // val save_output_fastqs + true // val save_reads_assignment + ) + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions) + + // + // MODULE: Get lineage for kraken output. + // + GET_LINEAGE_FOR_KRAKEN ( + KRAKEN2_KRAKEN2.out.classified_reads_assignment, + ncbi_rankedlineage_path + ) + ch_versions = ch_versions.mix(GET_LINEAGE_FOR_KRAKEN.out.versions) + + emit: + KRAKEN2_KRAKEN2.out.classified_reads_assignment + KRAKEN2_KRAKEN2.out.report + GET_LINEAGE_FOR_KRAKEN.out.txt + versions = ch_versions.ifEmpty(null) +} \ No newline at end of file From 0b6a223da55a680bfdf3052a94ad27b64614e525 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 18 Jul 2023 14:44:34 +0100 Subject: [PATCH 2/6] Updates --- workflows/ascc.nf | 68 ++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 2c205a3..5a37303 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -35,7 +35,7 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { RUN_NT_KRAKEN } from '..//subworkflows/local/run_nt_kraken' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,8 +46,6 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* @@ -57,57 +55,55 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft */ // Info required for completion email and summary -def multiqc_report = [] workflow ASCC { + main: ch_versions = Channel.empty() + input_ch = Channel.fromPath(params.input, checkIfExists: true) + // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE // - INPUT_CHECK ( - file(params.input) + YAML_INPUT ( + input_ch ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! 
There is currently no tooling to help you write a sample sheet schema
+    ch_versions = ch_versions.mix(YAML_INPUT.out.versions)
 
     //
-    // MODULE: Run FastQC
+    // SUBWORKFLOW: GENERATE GENOME FILE
     //
-    FASTQC (
-        INPUT_CHECK.out.reads
+    GENERATE_GENOME (
+        YAML_INPUT.out.assembly_title,
+        YAML_INPUT.out.reference
     )
-    ch_versions = ch_versions.mix(FASTQC.out.versions.first())
+    ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions)
 
-    CUSTOM_DUMPSOFTWAREVERSIONS (
-        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    //
+    // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA
+    //
+    EXTRACT_TIARA_HITS (
+        GENERATE_GENOME.out.reference_tuple
     )
+    ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions)
 
     //
-    // MODULE: MultiQC
+    // SUBWORKFLOW:
     //
-    workflow_summary    = WorkflowAscc.paramsSummaryMultiqc(workflow, summary_params)
-    ch_workflow_summary = Channel.value(workflow_summary)
-
-    methods_description    = WorkflowAscc.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params)
-    ch_methods_description = Channel.value(methods_description)
-
-    ch_multiqc_files = Channel.empty()
-    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
-    ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
-    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
-
-    MULTIQC (
-        ch_multiqc_files.collect(),
-        ch_multiqc_config.toList(),
-        ch_multiqc_custom_config.toList(),
-        ch_multiqc_logo.toList()
+    RUN_NT_KRAKEN (
+        GENERATE_GENOME.out.reference_tuple
+    )
+
+    // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input")
+    // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/
+    // ! There is currently no tooling to help you write a sample sheet schema
+
+
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
-    multiqc_report = MULTIQC.out.report.toList()
+
 }
 
 /*

From 0ac544c308f2818644b2e5c413d38180bcc169c4 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Tue, 18 Jul 2023 15:06:47 +0100
Subject: [PATCH 3/6] Integrating YAML_input with Kraken module, modifying
 input channel to include single_end variable

---
 subworkflows/local/run_nt_kraken.nf | 19 +++++---
 workflows/ascc.nf                   | 54 ++++++------------------------
 2 files changed, 20 insertions(+), 53 deletions(-)

diff --git a/subworkflows/local/run_nt_kraken.nf b/subworkflows/local/run_nt_kraken.nf
index 0e302d8..db2f668 100755
--- a/subworkflows/local/run_nt_kraken.nf
+++ b/subworkflows/local/run_nt_kraken.nf
@@ -15,11 +15,20 @@ workflow RUN_NT_KRAKEN {
 
     main:
     ch_versions = Channel.empty()
 
+    assembly_fasta
+        .map{ it ->
+            tuple([id: it[0].id,
+                    single_end: true
+                ],
+                it[1]
+            )
+        }
+        .set { modified_input }
     //
     // MODULE: Kraken2 run on assembly fasta.
     //
-    KRAKEN2_KRAKEN2 (
-        assembly_fasta, // val(meta), path(reads)
+    KRAKEN2_KRAKEN2 (
+        modified_input, // val(meta), path(reads)
         nt_kraken_db_path, // path db
         false, // val save_output_fastqs
         true // val save_reads_assignment
     )
     ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions)
 
     //
     // MODULE: Get lineage for kraken output.
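     // (With the --use-names flag set in conf/modules.config, each line of
     // the classified-reads file reports its taxon as "name (taxid N)";
     // get_lineage_for_kraken_results.py extracts the taxid from that field
     // and joins it against NCBI rankedlineage.dmp to attach name, species,
     // genus, family, order, class, phylum, kingdom and domain columns.)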
// - GET_LINEAGE_FOR_KRAKEN ( + GET_LINEAGE_FOR_KRAKEN ( KRAKEN2_KRAKEN2.out.classified_reads_assignment, ncbi_rankedlineage_path ) ch_versions = ch_versions.mix(GET_LINEAGE_FOR_KRAKEN.out.versions) - + emit: KRAKEN2_KRAKEN2.out.classified_reads_assignment KRAKEN2_KRAKEN2.out.report GET_LINEAGE_FOR_KRAKEN.out.txt versions = ch_versions.ifEmpty(null) -} \ No newline at end of file +} diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 7a100b7..22c040b 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -28,23 +28,13 @@ include { EXTRACT_TIARA_HITS } from '../subworkflows/local/extract_tiara_hits' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -<<<<<<< HEAD include { RUN_NT_KRAKEN } from '..//subworkflows/local/run_nt_kraken' -======= ->>>>>>> dev /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -<<<<<<< HEAD - -// -// MODULE: Installed directly from nf-core/modules -// -======= ->>>>>>> dev include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* @@ -53,55 +43,23 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -<<<<<<< HEAD -// Info required for completion email and summary -======= - ->>>>>>> dev workflow ASCC { main: ch_versions = Channel.empty() -<<<<<<< HEAD - input_ch = Channel.fromPath(params.input, checkIfExists: true) - - // - // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE - // - YAML_INPUT ( - input_ch - ) - ch_versions = ch_versions.mix(YAML_INPUT.out.versions) - - // - // SUBWORKFLOW: GENERATE GENOME FILE - // - GENERATE_GENOME ( - YAML_INPUT.out.assembly_title, - YAML_INPUT.out.reference - ) - ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) -======= input_ch = Channel.fromPath(params.input, checkIfExists: true) YAML_INPUT ( input_ch ) ch_versions = ch_versions.mix(YAML_INPUT.out.versions) - GENERATE_GENOME ( YAML_INPUT.out.assembly_title, - YAML_INPUT.out.reference + GENERATE_GENOME ( YAML_INPUT.out.assembly_title, + YAML_INPUT.out.reference ) - ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) - EXTRACT_TIARA_HITS ( - GENERATE_GENOME.out.reference_tuple - ) - ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions.first()) ->>>>>>> dev - // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA // @@ -110,13 +68,15 @@ workflow ASCC { ) ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) -<<<<<<< HEAD // // SUBWORKFLOW: // RUN_NT_KRAKEN ( - GENERATE_GENOME.out.reference_tuple + GENERATE_GENOME.out.reference_tuple, + YAML_INPUT.out.nt_kraken_db_path, + YAML_INPUT.out.ncbi_rankedlineage_path ) + ch_versions = ch_versions.mix(RUN_NT_KRAKEN.out.versions) // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ @@ -127,11 +87,9 @@ workflow ASCC { ch_versions.unique().collectFile(name: 'collated_versions.yml') ) -======= emit: software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions ->>>>>>> dev } /* From 03a84f89140b8a183e82fc9cd62cd45fbdcba5da Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 18 Jul 2023 15:42:47 +0100 Subject: [PATCH 4/6] 
Fixed get_lineage module by changing container and conda envs --- conf/base.config | 6 ++++++ modules/local/get_lineage_for_kraken.nf | 8 ++++---- modules/nf-core/kraken2/kraken2/main.nf | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/conf/base.config b/conf/base.config index f4a6aca..70a7459 100644 --- a/conf/base.config +++ b/conf/base.config @@ -19,6 +19,12 @@ process { maxRetries = 1 maxErrors = '-1' + withName:KRAKEN2_KRAKEN2 { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 400.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + // Process-specific resource requirements // NOTE - Please try and re-use the labels below as much as possible. // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. diff --git a/modules/local/get_lineage_for_kraken.nf b/modules/local/get_lineage_for_kraken.nf index a5f6dd6..59523c0 100755 --- a/modules/local/get_lineage_for_kraken.nf +++ b/modules/local/get_lineage_for_kraken.nf @@ -3,10 +3,10 @@ process GET_LINEAGE_FOR_KRAKEN { tag "$meta.id" label 'process_low' - conda "conda-forge::python=3.9" + conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'biocontainers/python:3.9' }" + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(kraken_file) @@ -48,4 +48,4 @@ process GET_LINEAGE_FOR_KRAKEN { get_lineage_for_kraken_results.py: \$(get_lineage_for_kraken_results.py --version | cut -d' ' -f2) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf index da8d8c6..811341f 100644 --- a/modules/nf-core/kraken2/kraken2/main.nf +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -33,13 +33,13 @@ process KRAKEN2_KRAKEN2 { def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : "" def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" def compress_reads_command = save_output_fastqs ? 
"pigz -p $task.cpus *.fastq" : "" - + // --gzip-compressed \\ """ kraken2 \\ + --db kraken2 \\ --db $db \\ --threads $task.cpus \\ --report ${prefix}.kraken2.report.txt \\ - --gzip-compressed \\ $unclassified_option \\ $classified_option \\ $readclassification_option \\ From 7623f04cc438ef07c2dab93aae81673716203723 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 18 Jul 2023 15:43:57 +0100 Subject: [PATCH 5/6] adding diff file --- modules.json | 5 +++-- .../kraken2/kraken2/kraken2-kraken2.diff | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff diff --git a/modules.json b/modules.json index c20056c..95aac67 100644 --- a/modules.json +++ b/modules.json @@ -24,7 +24,8 @@ "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": [ "modules" - ] + ], + "patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff" }, "multiqc": { "branch": "master", @@ -44,4 +45,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff b/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff new file mode 100644 index 0000000..f7c794f --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff @@ -0,0 +1,21 @@ +Changes in module 'nf-core/kraken2/kraken2' +--- modules/nf-core/kraken2/kraken2/main.nf ++++ modules/nf-core/kraken2/kraken2/main.nf +@@ -33,13 +33,13 @@ + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : "" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" + def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : "" +- ++ // --gzip-compressed \\ + """ + kraken2 \\ ++ --db kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ +- --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + +************************************************************ From 49e76636fab6a215694611b597d4e5a63b1fa415 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 19 Jul 2023 14:53:42 +0100 Subject: [PATCH 6/6] Updates --- conf/modules.config | 2 +- workflows/ascc.nf | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 181ae79..238d4fb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -27,7 +27,7 @@ process { } withName: KRAKEN2_KRAKEN2 { - ext.args = { "--report-zero-counts --use-names" } + ext.args = { "--report-zero-counts --use-names --memory-mapping" } } } diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 22c040b..e1e7c9f 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -106,6 +106,8 @@ workflow.onComplete { if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) } + // TreeValProject.summary(workflow, reference_tuple, summary_params, projectDir) + } /*