Skip to content

Commit

Permalink
Merge pull request #22 from sanger-tol/kraken
Browse files Browse the repository at this point in the history
Kraken
  • Loading branch information
DLBPointon authored Jul 19, 2023
2 parents cc631b7 + 15457ed commit f2d1d58
Show file tree
Hide file tree
Showing 11 changed files with 622 additions and 5 deletions.
228 changes: 228 additions & 0 deletions bin/general_purpose_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
General purpose functions
File for functions that can be reused in many Python scripts
"""
# MIT License
#
# Copyright (c) 2020-2021 Genome Research Ltd.
#
# Author: Eerik Aunin ([email protected])
#
# This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
from os.path import isfile
import sys
import subprocess
import signal
from datetime import datetime
import argparse


def l(path):
    """
    Load a text file into a list of lines, stripping trailing whitespace
    (including the line break) from each line.
    Writes an error to STDERR and exits if the file does not exist.
    """
    if not isfile(path):
        sys.stderr.write("Error: file not found (" + path + ")\n")
        sys.exit(1)
    with open(path, "r") as in_file:
        return [line.rstrip() for line in in_file]


def ll(in_path):
    """
    Lazily yield the lines of a text file one by one, with trailing
    whitespace (including the line break) removed.
    Writes an error to STDERR and exits if the file does not exist.
    """
    if not isfile(in_path):
        sys.stderr.write("Error: file not found (" + in_path + ")\n")
        sys.exit(1)
    with open(in_path, "r") as in_file:
        for raw_line in in_file:
            yield raw_line.rstrip()


def export_list_as_line_break_separated_file(out_list_f, out_path):
    """
    Write a list to a file, one item per line (items are converted with str()).
    """
    with open(out_path, "w") as out_file:
        out_file.writelines(str(item) + "\n" for item in out_list_f)


def spl(line, left_splitter, right_splitter, direction=0):
    """
    Crop a string between a left and a right splitter substring.
    direction=0: split on left_splitter first, then on right_splitter.
    direction=1: split on right_splitter first, then on left_splitter.
    Returns None when either splitter is missing from the input string
    (or when direction is neither 0 nor 1).
    """
    if left_splitter not in line or right_splitter not in line:
        return None
    if direction == 0:
        return line.split(left_splitter)[1].split(right_splitter)[0]
    if direction == 1:
        return line.split(right_splitter)[0].split(left_splitter)[1]
    return None


def print_with_fixed_row_length(seq, max_length):
    """
    Print a string to STDOUT in chunks of at most max_length characters,
    one chunk per line.
    """
    for start in range(0, len(seq), max_length):
        print(seq[start:start + max_length])


def split_with_fixed_row_length(seq, max_length):
    """
    Split a string into a list of chunks of at most max_length characters.
    """
    chunks = []
    for start in range(0, len(seq), max_length):
        chunks.append(seq[start:start + max_length])
    return chunks


def reverse_complement(seq):
    """
    Return the reverse complement of a DNA sequence.
    Both upper- and lower-case A/C/G/T/N are handled; any other character
    is passed through unchanged.
    """
    complement_table = str.maketrans("ACGTNacgtn", "TGCANtgcan")
    return seq.translate(complement_table)[::-1]


def read_fasta_in_chunks(in_path):
    """
    Iterate over a FASTA file, yielding one (header, sequence) tuple per
    record. The header is returned without the leading ">" and the record's
    sequence lines are concatenated into a single string.
    """
    header = None
    seq_parts = []
    for line in ll(in_path):
        if line == "":
            continue
        if line.startswith(">"):
            # A new header ends the previous record (if one was collected)
            if seq_parts:
                yield (header, "".join(seq_parts))
                seq_parts = []
            header = line[1:]
        else:
            seq_parts.append(line)
    if seq_parts:
        yield (header, "".join(seq_parts))


def list_to_chunks(lst, n):
    """
    Yield successive n-sized chunks from lst; the final chunk may be shorter.
    https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """
    start = 0
    while start < len(lst):
        yield lst[start:start + n]
        start += n


def string_to_chunks(line, n):
    """
    Split a string every nth character and return the pieces as a list.
    https://stackoverflow.com/questions/9475241/split-string-every-nth-character
    """
    pieces = []
    pos = 0
    while pos < len(line):
        pieces.append(line[pos:pos + n])
        pos += n
    return pieces


def run_system_command(system_command, verbose=True, dry_run=False, tries=1, expected_exit_code=0):
    """
    Executes a system command and checks its exit code

    system_command: command string, executed through a shell (so pipes and
        redirection are allowed)
    verbose: if True, a timestamped log line is written to STDERR before
        each attempt
    dry_run: if True, nothing is executed at all
    tries: number of attempts before giving up
    expected_exit_code: exit code treated as success; if the final attempt
        fails with any other code, the whole process is terminated by
        sending SIGINT to itself
    """
    # Name of the invoking script, used as a tag in log messages
    triggering_script_name = sys.argv[0].split("/")[-1]
    try_counter_string = ""
    if dry_run == False:
        for i in range(0, tries):
            if verbose == True:
                time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                if i > 0:
                    # Mark retries in the log, e.g. ", try 2"
                    try_counter_string = ", try {}".format(i + 1)
                out_message = "<{}, {}{}> executing command: {}\n".format(time_now, triggering_script_name, try_counter_string, system_command)
                sys.stderr.write(out_message)
            try:
                # NOTE(review): shell=True means the command is interpreted by the
                # shell — callers must not pass untrusted input to this function
                output = subprocess.check_output(system_command, stderr=subprocess.STDOUT, shell=True, timeout=None, universal_newlines=True)
                # Success: stop retrying
                break
            except subprocess.CalledProcessError as exc:
                out_errormessage = "<" + triggering_script_name + "> " + " exited with error code " + str(exc.returncode)
                # Append captured stdout+stderr only if it contains something visible
                if exc.output.isspace() == False:
                    out_errormessage += ". Error message: " + exc.output
                if i == tries - 1:
                    # Last attempt failed: abort the whole process unless this
                    # exit code was the expected one
                    if exc.returncode != expected_exit_code:
                        sys.stderr.write(out_errormessage + "\n")
                        os.kill(os.getpid(), signal.SIGINT)



def check_if_file_exists(in_path):
    """
    Exit with an error message on STDERR if the given file does not exist.
    """
    if not os.path.isfile(in_path):
        sys.stderr.write("The input file " + in_path + " was not found\n")
        sys.exit(1)


def get_file_paths(in_folder_path, extension):
    """
    Function for getting the paths to all files with a specific extension in a user-specified folder
    in_folder_path: path to the folder with input files
    extension: file extension of input files, without the leading dot
        (e.g. "fasta"; multi-part extensions such as "tar.gz" also work)
    Output: paths to individual files with the specific extension (list)
    Exits with an error if the folder does not exist.
    """
    if not os.path.isdir(in_folder_path):
        sys.stderr.write("Error: folder not found (" + in_folder_path + ")\n")
        sys.exit(1)
    suffix = "." + extension
    selected_file_paths = []
    for file_item in os.listdir(in_folder_path):
        # endswith() replaces the previous substring-plus-last-split check:
        # equivalent for single-part extensions, and additionally correct for
        # multi-part extensions ("tar.gz"), which the old check always rejected
        if file_item.endswith(suffix) and os.path.isfile(os.path.join(in_folder_path, file_item)):
            selected_file_paths.append(in_folder_path + "/" + file_item)
    return selected_file_paths

def main():
    """Placeholder entry point; the script currently only exposes --version."""
    return None

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-v", "--version", action="version", version="1.0")
    # Bug fix: parse_args() was never called, so "-v/--version" had no effect
    # and unrecognized arguments were silently accepted.
    parser.parse_args()
    main()
94 changes: 94 additions & 0 deletions bin/get_lineage_for_kraken_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""
Script for getting lineage for Kraken results
Developed by Eerik Aunin ([email protected])
"""

import general_purpose_functions as gpf
import sys
from collections import OrderedDict
import pandas as pd
import os
import signal
import argparse


def load_kraken_results(kraken_results_path):
    """
    Read sequence names and taxid values from a Kraken results file.
    Returns an OrderedDict mapping sequence name -> taxid (as a string).
    Lines without a "(taxid " tag or with too few fields are reported to
    STDERR and skipped; a duplicate read name aborts the whole process.
    """
    kraken_dict = OrderedDict()
    for line in gpf.ll(kraken_results_path):
        if "(taxid " not in line:
            sys.stderr.write("No taxid found in input file line:\n{}\n".format(line))
            continue
        split_line = line.split()
        if len(split_line) < 5:
            sys.stderr.write("Failed to parse Kraken output file line:\n{}\n".format(line))
            continue
        seq_name = split_line[1]
        if seq_name in kraken_dict:
            sys.stderr.write("Duplicate read names found in input ({})\n".format(seq_name))
            os.kill(os.getpid(), signal.SIGINT)
        else:
            kraken_dict[seq_name] = gpf.spl(line, "(taxid ", ")")
    return kraken_dict


def load_lineage(lineage_dump_path, kraken_db_name):
    """
    Reads lineage information from NCBI rankedlineage.dmp file (downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz) as a dictionary of dictionaries.
    Keys: taxid values. Values: a dictionary where the keys are taxonomic unit names
    (e.g. "species", "genus" or "family"), prefixed with the Kraken database
    name, and the values are the corresponding taxonomic names.
    """
    # Column order of rankedlineage.dmp after the leading taxid field
    rank_labels = ("name", "species", "genus", "family", "order", "class", "phylum", "kingdom", "domain")
    lineage_dict = OrderedDict()
    for line in gpf.ll(lineage_dump_path):
        fields = [field.strip() for field in line.split("|")]
        lineage_dict[fields[0]] = {
            "{}_kraken_{}".format(kraken_db_name, label): fields[idx + 1]
            for idx, label in enumerate(rank_labels)
        }
    return lineage_dict


def get_kraken_and_lineage_dict(kraken_dict, lineage_dict, kraken_db_name):
    """
    Merge per-sequence Kraken taxid assignments with NCBI lineage data.
    For each sequence the lineage entry for its taxid is used (and the taxid
    itself is recorded in it); taxids missing from the lineage dump get a
    placeholder entry with taxid "0" and None for every taxonomic rank, and
    any missing non-zero taxid is reported to STDERR.
    """
    prefix = kraken_db_name + "_kraken_"
    merged = OrderedDict()
    for seq_name, taxid in kraken_dict.items():
        if taxid in lineage_dict:
            # Note: this deliberately mutates the shared lineage entry,
            # adding the taxid key, exactly as the original code did
            entry = lineage_dict[taxid]
            entry[prefix + "taxid"] = taxid
        else:
            if taxid != "0":
                sys.stderr.write("Taxid {} was not found in the lineages dump file\n".format(taxid))
            entry = {prefix + "taxid": "0"}
            for label in ("name", "species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"):
                entry[prefix + label] = None
        merged[seq_name] = entry
    return merged


def main(kraken_results_path, lineage_dump_path, kraken_db_name, out_path):
    """
    Merge Kraken classifications with NCBI lineage information and write the
    result to a CSV file whose index column ("scaff") holds sequence names.
    """
    merged = get_kraken_and_lineage_dict(
        load_kraken_results(kraken_results_path),
        load_lineage(lineage_dump_path, kraken_db_name),
        kraken_db_name,
    )
    out_df = pd.DataFrame.from_dict(merged).transpose()
    out_df.index = out_df.index.rename("scaff")
    out_df.to_csv(out_path)


if __name__ == "__main__":
    # Command-line interface: positional arguments mirror main()'s parameters.
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument("-v", "--version", action="version", version="1.0")
    arg_parser.add_argument("kraken_results_path", help="Path to output file of a Kraken run", type=str)
    arg_parser.add_argument(
        "lineage_dump_path",
        help="Path to an NCBI taxonomy rankedlineage.dmp file (downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz)",
        type=str,
    )
    arg_parser.add_argument("kraken_db_name", help="Kraken database name", type=str, choices=["bacterial", "nt"])
    arg_parser.add_argument("out_path", help="Path for output CSV file", type=str)
    cli_args = arg_parser.parse_args()
    main(cli_args.kraken_results_path, cli_args.lineage_dump_path, cli_args.kraken_db_name, cli_args.out_path)
6 changes: 6 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ process {
maxRetries = 1
maxErrors = '-1'

withName:KRAKEN2_KRAKEN2 {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 400.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
}

// Process-specific resource requirements
// NOTE - Please try and re-use the labels below as much as possible.
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
Expand Down
4 changes: 4 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,8 @@ process {
]
}

withName: KRAKEN2_KRAKEN2 {
ext.args = { "--report-zero-counts --use-names --memory-mapping" }
}

}
8 changes: 8 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
"modules"
]
},
"kraken2/kraken2": {
"branch": "master",
"git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",
"installed_by": [
"modules"
],
"patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff"
},
"multiqc": {
"branch": "master",
"git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7",
Expand Down
Loading

0 comments on commit f2d1d58

Please sign in to comment.