From 0788bee679924783cc9afa25bb3bd5c592a2fb3f Mon Sep 17 00:00:00 2001
From: William Eagles
Date: Thu, 13 Jul 2023 14:22:58 +0100
Subject: [PATCH 1/6] Add initial kraken subworkflow

---
 bin/general_purpose_functions.py         | 228 +++++++++++++++++++++++
 bin/get_lineage_for_kraken_results.py    |  94 ++++++++++
 conf/modules.config                      |   4 +
 modules.json                             |  21 ++-
 modules/local/get_lineage_for_kraken.nf  |  51 +++++
 modules/nf-core/kraken2/kraken2/main.nf  |  58 ++++++
 modules/nf-core/kraken2/kraken2/meta.yml |  75 ++++++++
 subworkflows/local/run_nt_kraken.nf      |  43 +++++
 8 files changed, 570 insertions(+), 4 deletions(-)
 create mode 100755 bin/general_purpose_functions.py
 create mode 100755 bin/get_lineage_for_kraken_results.py
 create mode 100755 modules/local/get_lineage_for_kraken.nf
 create mode 100644 modules/nf-core/kraken2/kraken2/main.nf
 create mode 100644 modules/nf-core/kraken2/kraken2/meta.yml
 create mode 100755 subworkflows/local/run_nt_kraken.nf

diff --git a/bin/general_purpose_functions.py b/bin/general_purpose_functions.py
new file mode 100755
index 0000000..c2c216d
--- /dev/null
+++ b/bin/general_purpose_functions.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""
+General purpose functions
+File for functions that can be reused in many Python scripts
+"""
+# MIT License
+#
+# Copyright (c) 2020-2021 Genome Research Ltd.
+#
+# Author: Eerik Aunin (ea10@sanger.ac.uk)
+#
+# This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
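+#
+# Usage sketch: these helpers are imported by the other scripts in bin/,
+# which sit next to this file at runtime, e.g.
+#
+#   import general_purpose_functions as gpf
+#   for header, seq in gpf.read_fasta_in_chunks("assembly.fasta"):  # placeholder path
+#       print(header, len(seq))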
+ +import os +from os.path import isfile +import sys +import subprocess +import signal +from datetime import datetime +import argparse + + +def l(path): + """ + Function for loading text file as a list and removing line breaks from line ends + """ + lines = [] + if isfile(path): + with open(path, "r") as in_file: + lines = in_file.readlines() + lines = [x.rstrip() for x in lines] + else: + sys.stderr.write("Error: file not found (" + path + ")\n") + sys.exit(1) + return lines + + +def ll(in_path): + """ + Function for reading a text file line by line + """ + if isfile(in_path): + with open(in_path, "r") as in_file: + for line in in_file: + line = line.rstrip() + yield line + else: + sys.stderr.write("Error: file not found (" + in_path + ")\n") + sys.exit(1) + + +def export_list_as_line_break_separated_file(out_list_f, out_path): + """ + Exports a list to a file, each item on a separate row + """ + with open(out_path, "w") as out_file: + for item in out_list_f: + out_file.write(str(item)) + out_file.write("\n") + + +def spl(line, left_splitter, right_splitter, direction=0): + """ + Function for cropping a string from left and right side + Direction: if 0: the string will be cropped first from left and then right + if 1: the string will be cropped first from right and then left + Returns None if the splitting cannot be done because a splitter or both splitters are not in the input string + """ + out_line = None + if left_splitter in line and right_splitter in line: + if direction == 0: + out_line = line.split(left_splitter)[1] + out_line = out_line.split(right_splitter)[0] + elif direction == 1: + out_line = line.split(right_splitter)[0] + out_line = out_line.split(left_splitter)[1] + return out_line + + +def print_with_fixed_row_length(seq, max_length): + """ + Input: 1) a string 2) maximum line length in output + Output: the input string printed to STDOUT in chunks with fixed maximum line length + """ + split_seq = [seq[i:i+max_length] for i in range(0, len(seq), max_length)] + for line in split_seq: + print(line) + + +def split_with_fixed_row_length(seq, max_length): + """ + Input: 1) a string 2) maximum line length in output + Output: the input string split in chunks with fixed maximum line length + """ + split_seq = [seq[i:i + max_length] for i in range(0, len(seq), max_length)] + return split_seq + + +def reverse_complement(seq): + """ + Returns the reverse complement of a DNA sequence + """ + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'n': 'n'} + reverse_comp = "".join(complement.get(base, base) for base in reversed(seq)) + return reverse_comp + + +def read_fasta_in_chunks(in_path): + """ + Input: path to FASTA file + Output (iterator): string tuples where the first element is a FASTA header and the second element is the corresponding FASTA sequence + """ + in_data = ll(in_path) + current_seq_header = None + seq = "" + for line in in_data: + if line != "": + if line[0] == ">": + if seq != "": + yield (current_seq_header, seq) + seq = "" + current_seq_header = line[1:len(line)] + else: + seq += line + if seq != "": + yield (current_seq_header, seq) + + +def list_to_chunks(lst, n): + """ + Yield successive n-sized chunks from lst + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks + """ + for i in range(0, len(lst), n): + yield lst[i: i + n] + + +def string_to_chunks(line, n): + """ + Function for splitting a string every nth character + 
https://stackoverflow.com/questions/9475241/split-string-every-nth-character + """ + return [line[i: i + n] for i in range(0, len(line), n)] + + +def run_system_command(system_command, verbose=True, dry_run=False, tries=1, expected_exit_code=0): + """ + Executes a system command and checks its exit code + """ + triggering_script_name = sys.argv[0].split("/")[-1] + try_counter_string = "" + if dry_run == False: + for i in range(0, tries): + if verbose == True: + time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if i > 0: + try_counter_string = ", try {}".format(i + 1) + out_message = "<{}, {}{}> executing command: {}\n".format(time_now, triggering_script_name, try_counter_string, system_command) + sys.stderr.write(out_message) + try: + output = subprocess.check_output(system_command, stderr=subprocess.STDOUT, shell=True, timeout=None, universal_newlines=True) + break + except subprocess.CalledProcessError as exc: + out_errormessage = "<" + triggering_script_name + "> " + " exited with error code " + str(exc.returncode) + if exc.output.isspace() == False: + out_errormessage += ". Error message: " + exc.output + if i == tries - 1: + if exc.returncode != expected_exit_code: + sys.stderr.write(out_errormessage + "\n") + os.kill(os.getpid(), signal.SIGINT) + + + +def check_if_file_exists(in_path): + """ + Function for checking if a file exists + """ + if os.path.isfile(in_path) == False: + sys.stderr.write("The input file " + in_path + " was not found\n") + sys.exit(1) + + +def get_file_paths(in_folder_path, extension): + """ + Function for getting the paths to all files with a specific extension in a user-specified folder + in_folder_path: path to the folder with input files + extension: file extension of input files + Output: paths to individual files with the specific extension (list) + """ + onlyfiles = list() + selected_file_paths = list() + if os.path.isdir(in_folder_path): + onlyfiles = [f for f in os.listdir(in_folder_path) if os.path.isfile(os.path.join(in_folder_path, f))] + for file_item in onlyfiles: + if "." + extension in file_item: + file_item_split = file_item.split(".") + if file_item_split[len(file_item_split) - 1] == extension: + selected_file_paths.append(in_folder_path + "/" + file_item) + else: + sys.stderr.write("Error: folder not found (" + in_folder_path + ")\n") + sys.exit(1) + return selected_file_paths + +def main(): + # Placeholder for accessing version. 
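+    # The script is used as a library; the argparse entry point below exists
+    # only so that callers (e.g. the versions.yml block in
+    # modules/local/get_lineage_for_kraken.nf) can query "--version".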
+
+    pass
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("-v", "--version", action="version", version="1.0")
+    parser.parse_args()
diff --git a/bin/get_lineage_for_kraken_results.py b/bin/get_lineage_for_kraken_results.py
new file mode 100755
index 0000000..a039990
--- /dev/null
+++ b/bin/get_lineage_for_kraken_results.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Script for getting lineage for Kraken results
+Developed by Eerik Aunin (ea10@sanger.ac.uk)
+"""
+
+import general_purpose_functions as gpf
+import sys
+from collections import OrderedDict
+import pandas as pd
+import os
+import signal
+import argparse
+
+
+def load_kraken_results(kraken_results_path):
+    """
+    Reads sequence names and taxid values from Kraken results file into a dictionary
+    """
+    kraken_dict = OrderedDict()
+    kraken_data = gpf.ll(kraken_results_path)
+    for line in kraken_data:
+        if "(taxid " in line:
+            split_line = line.split()
+            if len(split_line) >= 5:
+                seq_name = split_line[1]
+                taxid = gpf.spl(line, "(taxid ", ")")
+                if seq_name in kraken_dict:
+                    sys.stderr.write("Duplicate read names found in input ({})\n".format(seq_name))
+                    os.kill(os.getpid(), signal.SIGINT)
+                else:
+                    kraken_dict[seq_name] = taxid
+            else:
+                sys.stderr.write("Failed to parse Kraken output file line:\n{}\n".format(line))
+        else:
+            sys.stderr.write("No taxid found in input file line:\n{}\n".format(line))
+    return kraken_dict
+
+
+def load_lineage(lineage_dump_path, kraken_db_name):
+    """
+    Reads lineage information from NCBI rankedlineage.dmp file (downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz) as a dictionary of dictionaries.
+    Keys: taxid values. Values: a dictionary where the keys are taxonomic unit names
+    (e.g. "species", "genus" or "family") and the values are the corresponding taxonomic names
+    """
+    lineage_data = gpf.ll(lineage_dump_path)
+    lineage_dict = OrderedDict()
+    for line in lineage_data:
+        split_line = line.split("|")
+        split_line = [n.strip() for n in split_line]
+        entry_dict = {kraken_db_name + "_kraken_name": split_line[1], kraken_db_name + "_kraken_species": split_line[2], kraken_db_name + "_kraken_genus": split_line[3], kraken_db_name + "_kraken_family": split_line[4], kraken_db_name + "_kraken_order": split_line[5], kraken_db_name + "_kraken_class": split_line[6], kraken_db_name + "_kraken_phylum": split_line[7], kraken_db_name + "_kraken_kingdom": split_line[8], kraken_db_name + "_kraken_domain": split_line[9]}
+        lineage_dict[split_line[0]] = entry_dict
+    return lineage_dict
+
+
+def get_kraken_and_lineage_dict(kraken_dict, lineage_dict, kraken_db_name):
+    """
+    Merges the kraken results with lineage information for the taxid numbers
+    """
+    kraken_and_lineage_dict = OrderedDict()
+    for seq_name in kraken_dict:
+        taxid = kraken_dict[seq_name]
+        lineage_entry = {kraken_db_name + "_kraken_taxid": "0", kraken_db_name + "_kraken_name": None, kraken_db_name + "_kraken_species": None, kraken_db_name + "_kraken_genus": None, kraken_db_name + "_kraken_family": None, kraken_db_name + "_kraken_order": None, kraken_db_name + "_kraken_class": None, kraken_db_name + "_kraken_phylum": None, kraken_db_name + "_kraken_kingdom": None, kraken_db_name + "_kraken_domain": None}
+        if taxid in lineage_dict:
+            lineage_entry = lineage_dict[taxid]
+            lineage_entry[kraken_db_name + "_kraken_taxid"] = taxid
+        else:
+            if taxid != "0":
+                sys.stderr.write("Taxid {} was not found in the lineages dump file\n".format(taxid))
+        kraken_and_lineage_dict[seq_name] = 
lineage_entry + + return kraken_and_lineage_dict + + +def main(kraken_results_path, lineage_dump_path, kraken_db_name, out_path): + + kraken_dict = load_kraken_results(kraken_results_path) + lineage_dict = load_lineage(lineage_dump_path, kraken_db_name) + kraken_and_lineage_dict = get_kraken_and_lineage_dict(kraken_dict, lineage_dict, kraken_db_name) + df = pd.DataFrame.from_dict(kraken_and_lineage_dict) + df = df.transpose() + df.index = df.index.rename("scaff") + df.to_csv(out_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-v", "--version", action="version", version="1.0") + parser.add_argument("kraken_results_path", help="Path to output file of a Kraken run", type=str) + parser.add_argument("lineage_dump_path", help="Path to an NCBI taxonomy rankedlineage.dmp file (downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz)", type=str) + parser.add_argument("kraken_db_name", help="Kraken database name", type=str, choices=["bacterial", "nt"]) + parser.add_argument("out_path", help="Path for output CSV file", type=str) + args = parser.parse_args() + main(args.kraken_results_path, args.lineage_dump_path, args.kraken_db_name, args.out_path) diff --git a/conf/modules.config b/conf/modules.config index da58a5d..2460dc0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -38,4 +38,8 @@ process { ] } + withName: KRAKEN2_KRAKEN2 { + ext.args = { "--report-zero-counts --use-names" } + } + } diff --git a/modules.json b/modules.json index 6299d40..2821ead 100644 --- a/modules.json +++ b/modules.json @@ -8,20 +8,33 @@ "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "kraken2/kraken2": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/get_lineage_for_kraken.nf b/modules/local/get_lineage_for_kraken.nf new file mode 100755 index 0000000..a5f6dd6 --- /dev/null +++ b/modules/local/get_lineage_for_kraken.nf @@ -0,0 +1,51 @@ +process GET_LINEAGE_FOR_KRAKEN { + + tag "$meta.id" + label 'process_low' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/python:3.9' :
+        'biocontainers/python:3.9' }"

+    input:
+    tuple val(meta), path(kraken_file)
+    path ncbi_rankedlineage_path
+
+    output:
+    path '*_nt_kraken_lineage_file.txt', emit: txt
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    get_lineage_for_kraken_results.py \\
+        $kraken_file \\
+        $ncbi_rankedlineage_path \\
+        nt \\
+        ${prefix}_nt_kraken_lineage_file.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        general_purpose_functions.py: \$(general_purpose_functions.py --version | cut -d' ' -f2)
+        get_lineage_for_kraken_results.py: \$(get_lineage_for_kraken_results.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}_nt_kraken_lineage_file.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        general_purpose_functions.py: \$(general_purpose_functions.py --version | cut -d' ' -f2)
+        get_lineage_for_kraken_results.py: \$(get_lineage_for_kraken_results.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf
new file mode 100644
index 0000000..da8d8c6
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/main.nf
@@ -0,0 +1,58 @@
+process KRAKEN2_KRAKEN2 {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::kraken2=2.1.2 conda-forge::pigz=2.6"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
+        'biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path db
+    val save_output_fastqs
+    val save_reads_assignment
+
+    output:
+    tuple val(meta), path('*.classified{.,_}*')     , optional:true, emit: classified_reads_fastq
+    tuple val(meta), path('*.unclassified{.,_}*')   , optional:true, emit: unclassified_reads_fastq
+    tuple val(meta), path('*classifiedreads.txt')   , optional:true, emit: classified_reads_assignment
+    tuple val(meta), path('*report.txt')            , emit: report
+    path "versions.yml"                             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def paired       = meta.single_end ? "" : "--paired"
+    def classified   = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"
+    def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
+    def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ""
+    def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ""
+    def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null"
+    def compress_reads_command = save_output_fastqs ? 
"pigz -p $task.cpus *.fastq" : "" + + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + $paired \\ + $args \\ + $reads + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..4721f45 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -0,0 +1,75 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database + - save_output_fastqs: + type: string + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: string + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/subworkflows/local/run_nt_kraken.nf b/subworkflows/local/run_nt_kraken.nf new file mode 100755 index 0000000..0e302d8 --- /dev/null +++ b/subworkflows/local/run_nt_kraken.nf @@ -0,0 +1,43 @@ +#!/usr/bin/env nextflow + +// +// MODULE IMPORT BLOCK +// +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main' +include { GET_LINEAGE_FOR_KRAKEN } from '../../modules/local/get_lineage_for_kraken' + +workflow RUN_NT_KRAKEN { + take: + assembly_fasta + nt_kraken_db_path + ncbi_rankedlineage_path + + main: + ch_versions = Channel.empty() + + // + // MODULE: Kraken2 run on assembly fasta. 
+ // + KRAKEN2_KRAKEN2 ( + assembly_fasta, // val(meta), path(reads) + nt_kraken_db_path, // path db + false, // val save_output_fastqs + true // val save_reads_assignment + ) + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions) + + // + // MODULE: Get lineage for kraken output. + // + GET_LINEAGE_FOR_KRAKEN ( + KRAKEN2_KRAKEN2.out.classified_reads_assignment, + ncbi_rankedlineage_path + ) + ch_versions = ch_versions.mix(GET_LINEAGE_FOR_KRAKEN.out.versions) + + emit: + KRAKEN2_KRAKEN2.out.classified_reads_assignment + KRAKEN2_KRAKEN2.out.report + GET_LINEAGE_FOR_KRAKEN.out.txt + versions = ch_versions.ifEmpty(null) +} \ No newline at end of file From 0b6a223da55a680bfdf3052a94ad27b64614e525 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 18 Jul 2023 14:44:34 +0100 Subject: [PATCH 2/6] Updates --- workflows/ascc.nf | 68 ++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 2c205a3..5a37303 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -35,7 +35,7 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { RUN_NT_KRAKEN } from '..//subworkflows/local/run_nt_kraken' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,8 +46,6 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* @@ -57,57 +55,55 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft */ // Info required for completion email and summary -def multiqc_report = [] workflow ASCC { + main: ch_versions = Channel.empty() + input_ch = Channel.fromPath(params.input, checkIfExists: true) + // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE // - INPUT_CHECK ( - file(params.input) + YAML_INPUT ( + input_ch ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! 
There is currently no tooling to help you write a sample sheet schema
+    ch_versions = ch_versions.mix(YAML_INPUT.out.versions)
 
     //
-    // MODULE: Run FastQC
+    // SUBWORKFLOW: GENERATE GENOME FILE
     //
-    FASTQC (
-        INPUT_CHECK.out.reads
+    GENERATE_GENOME (
+        YAML_INPUT.out.assembly_title,
+        YAML_INPUT.out.reference
     )
-    ch_versions = ch_versions.mix(FASTQC.out.versions.first())
+    ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions)
 
-    CUSTOM_DUMPSOFTWAREVERSIONS (
-        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    //
+    // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA
+    //
+    EXTRACT_TIARA_HITS (
+        GENERATE_GENOME.out.reference_tuple
     )
+    ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions)
 
     //
-    // MODULE: MultiQC
+    // SUBWORKFLOW:
     //
-    workflow_summary    = WorkflowAscc.paramsSummaryMultiqc(workflow, summary_params)
-    ch_workflow_summary = Channel.value(workflow_summary)
-
-    methods_description    = WorkflowAscc.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params)
-    ch_methods_description = Channel.value(methods_description)
-
-    ch_multiqc_files = Channel.empty()
-    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
-    ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
-    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
-
-    MULTIQC (
-        ch_multiqc_files.collect(),
-        ch_multiqc_config.toList(),
-        ch_multiqc_custom_config.toList(),
-        ch_multiqc_logo.toList()
+    RUN_NT_KRAKEN (
+        GENERATE_GENOME.out.reference_tuple
+    )
+
+    // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input")
+    // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/
+    // ! There is currently no tooling to help you write a sample sheet schema
+
+
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
-    multiqc_report = MULTIQC.out.report.toList()
+
 }
 
 /*

From 0ac544c308f2818644b2e5c413d38180bcc169c4 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Tue, 18 Jul 2023 15:06:47 +0100
Subject: [PATCH 3/6] Integrating YAML_input with Kraken module, modifying
 input channel to include single_end variable

---
 subworkflows/local/run_nt_kraken.nf | 19 +++++---
 workflows/ascc.nf                   | 54 ++++++------------------------
 2 files changed, 20 insertions(+), 53 deletions(-)

diff --git a/subworkflows/local/run_nt_kraken.nf b/subworkflows/local/run_nt_kraken.nf
index 0e302d8..db2f668 100755
--- a/subworkflows/local/run_nt_kraken.nf
+++ b/subworkflows/local/run_nt_kraken.nf
@@ -15,11 +15,20 @@ workflow RUN_NT_KRAKEN {
 
     main:
     ch_versions = Channel.empty()
 
+    assembly_fasta
+        .map{ it ->
+            tuple([id: it[0].id,
+                    single_end: true
+                ],
+                it[1]
+            )
+        }
+        .set { modified_input }
     //
     // MODULE: Kraken2 run on assembly fasta.
     //
-    KRAKEN2_KRAKEN2 (
-        assembly_fasta, // val(meta), path(reads)
+    KRAKEN2_KRAKEN2 (
+        modified_input, // val(meta), path(reads)
         nt_kraken_db_path, // path db
         false, // val save_output_fastqs
         true // val save_reads_assignment
     )
     ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions)
 
     //
     // MODULE: Get lineage for kraken output.
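     // (With the --use-names flag set in conf/modules.config, each line of
     // the classified-reads file reports its taxon as "name (taxid N)";
     // get_lineage_for_kraken_results.py extracts the taxid from that field
     // and joins it against NCBI rankedlineage.dmp to attach name, species,
     // genus, family, order, class, phylum, kingdom and domain columns.)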
// - GET_LINEAGE_FOR_KRAKEN ( + GET_LINEAGE_FOR_KRAKEN ( KRAKEN2_KRAKEN2.out.classified_reads_assignment, ncbi_rankedlineage_path ) ch_versions = ch_versions.mix(GET_LINEAGE_FOR_KRAKEN.out.versions) - + emit: KRAKEN2_KRAKEN2.out.classified_reads_assignment KRAKEN2_KRAKEN2.out.report GET_LINEAGE_FOR_KRAKEN.out.txt versions = ch_versions.ifEmpty(null) -} \ No newline at end of file +} diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 7a100b7..22c040b 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -28,23 +28,13 @@ include { EXTRACT_TIARA_HITS } from '../subworkflows/local/extract_tiara_hits' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -<<<<<<< HEAD include { RUN_NT_KRAKEN } from '..//subworkflows/local/run_nt_kraken' -======= ->>>>>>> dev /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -<<<<<<< HEAD - -// -// MODULE: Installed directly from nf-core/modules -// -======= ->>>>>>> dev include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* @@ -53,55 +43,23 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -<<<<<<< HEAD -// Info required for completion email and summary -======= - ->>>>>>> dev workflow ASCC { main: ch_versions = Channel.empty() -<<<<<<< HEAD - input_ch = Channel.fromPath(params.input, checkIfExists: true) - - // - // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE - // - YAML_INPUT ( - input_ch - ) - ch_versions = ch_versions.mix(YAML_INPUT.out.versions) - - // - // SUBWORKFLOW: GENERATE GENOME FILE - // - GENERATE_GENOME ( - YAML_INPUT.out.assembly_title, - YAML_INPUT.out.reference - ) - ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) -======= input_ch = Channel.fromPath(params.input, checkIfExists: true) YAML_INPUT ( input_ch ) ch_versions = ch_versions.mix(YAML_INPUT.out.versions) - GENERATE_GENOME ( YAML_INPUT.out.assembly_title, - YAML_INPUT.out.reference + GENERATE_GENOME ( YAML_INPUT.out.assembly_title, + YAML_INPUT.out.reference ) - ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) - EXTRACT_TIARA_HITS ( - GENERATE_GENOME.out.reference_tuple - ) - ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions.first()) ->>>>>>> dev - // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA // @@ -110,13 +68,15 @@ workflow ASCC { ) ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) -<<<<<<< HEAD // // SUBWORKFLOW: // RUN_NT_KRAKEN ( - GENERATE_GENOME.out.reference_tuple + GENERATE_GENOME.out.reference_tuple, + YAML_INPUT.out.nt_kraken_db_path, + YAML_INPUT.out.ncbi_rankedlineage_path ) + ch_versions = ch_versions.mix(RUN_NT_KRAKEN.out.versions) // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ @@ -127,11 +87,9 @@ workflow ASCC { ch_versions.unique().collectFile(name: 'collated_versions.yml') ) -======= emit: software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions ->>>>>>> dev } /* From 03a84f89140b8a183e82fc9cd62cd45fbdcba5da Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 18 Jul 2023 15:42:47 +0100 Subject: [PATCH 4/6] 
Fixed get_lineage module by changing container and conda envs --- conf/base.config | 6 ++++++ modules/local/get_lineage_for_kraken.nf | 8 ++++---- modules/nf-core/kraken2/kraken2/main.nf | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/conf/base.config b/conf/base.config index f4a6aca..70a7459 100644 --- a/conf/base.config +++ b/conf/base.config @@ -19,6 +19,12 @@ process { maxRetries = 1 maxErrors = '-1' + withName:KRAKEN2_KRAKEN2 { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 400.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + // Process-specific resource requirements // NOTE - Please try and re-use the labels below as much as possible. // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. diff --git a/modules/local/get_lineage_for_kraken.nf b/modules/local/get_lineage_for_kraken.nf index a5f6dd6..59523c0 100755 --- a/modules/local/get_lineage_for_kraken.nf +++ b/modules/local/get_lineage_for_kraken.nf @@ -3,10 +3,10 @@ process GET_LINEAGE_FOR_KRAKEN { tag "$meta.id" label 'process_low' - conda "conda-forge::python=3.9" + conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'biocontainers/python:3.9' }" + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(kraken_file) @@ -48,4 +48,4 @@ process GET_LINEAGE_FOR_KRAKEN { get_lineage_for_kraken_results.py: \$(get_lineage_for_kraken_results.py --version | cut -d' ' -f2) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf index da8d8c6..811341f 100644 --- a/modules/nf-core/kraken2/kraken2/main.nf +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -33,13 +33,13 @@ process KRAKEN2_KRAKEN2 { def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : "" def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" def compress_reads_command = save_output_fastqs ? 
"pigz -p $task.cpus *.fastq" : "" - + // --gzip-compressed \\ """ kraken2 \\ + --db kraken2 \\ --db $db \\ --threads $task.cpus \\ --report ${prefix}.kraken2.report.txt \\ - --gzip-compressed \\ $unclassified_option \\ $classified_option \\ $readclassification_option \\ From 7623f04cc438ef07c2dab93aae81673716203723 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 18 Jul 2023 15:43:57 +0100 Subject: [PATCH 5/6] adding diff file --- modules.json | 5 +++-- .../kraken2/kraken2/kraken2-kraken2.diff | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff diff --git a/modules.json b/modules.json index c20056c..95aac67 100644 --- a/modules.json +++ b/modules.json @@ -24,7 +24,8 @@ "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": [ "modules" - ] + ], + "patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff" }, "multiqc": { "branch": "master", @@ -44,4 +45,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff b/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff new file mode 100644 index 0000000..f7c794f --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff @@ -0,0 +1,21 @@ +Changes in module 'nf-core/kraken2/kraken2' +--- modules/nf-core/kraken2/kraken2/main.nf ++++ modules/nf-core/kraken2/kraken2/main.nf +@@ -33,13 +33,13 @@ + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : "" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" + def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : "" +- ++ // --gzip-compressed \\ + """ + kraken2 \\ ++ --db kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ +- --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + +************************************************************ From 49e76636fab6a215694611b597d4e5a63b1fa415 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 19 Jul 2023 14:53:42 +0100 Subject: [PATCH 6/6] Updates --- conf/modules.config | 2 +- workflows/ascc.nf | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 181ae79..238d4fb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -27,7 +27,7 @@ process { } withName: KRAKEN2_KRAKEN2 { - ext.args = { "--report-zero-counts --use-names" } + ext.args = { "--report-zero-counts --use-names --memory-mapping" } } } diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 22c040b..e1e7c9f 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -106,6 +106,8 @@ workflow.onComplete { if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) } + // TreeValProject.summary(workflow, reference_tuple, summary_params, projectDir) + } /*