Skip to content

Commit

Permalink
Merge pull request #22 from sanger-tol/kraken
Browse files Browse the repository at this point in the history
Kraken
  • Loading branch information
DLBPointon authored Jul 19, 2023
2 parents cc631b7 + 15457ed commit f2d1d58
Show file tree
Hide file tree
Showing 11 changed files with 622 additions and 5 deletions.
228 changes: 228 additions & 0 deletions bin/general_purpose_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
General purpose functions
File for functions that can be reused in many Python scripts
"""
# MIT License
#
# Copyright (c) 2020-2021 Genome Research Ltd.
#
# Author: Eerik Aunin ([email protected])
#
# This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
from os.path import isfile
import sys
import subprocess
import signal
from datetime import datetime
import argparse


def l(path):
    """
    Load a text file into a list of lines, stripping trailing whitespace
    (including the line break) from each line.
    Writes an error to STDERR and exits if the file does not exist.
    """
    if not isfile(path):
        sys.stderr.write("Error: file not found (" + path + ")\n")
        sys.exit(1)
    with open(path, "r") as in_file:
        return [line.rstrip() for line in in_file]


def ll(in_path):
    """
    Lazily yield the lines of a text file one by one, with trailing
    whitespace (including the line break) removed.
    Writes an error to STDERR and exits if the file does not exist.
    """
    if not isfile(in_path):
        sys.stderr.write("Error: file not found (" + in_path + ")\n")
        sys.exit(1)
    with open(in_path, "r") as in_file:
        for raw_line in in_file:
            yield raw_line.rstrip()


def export_list_as_line_break_separated_file(out_list_f, out_path):
    """
    Write a list to a file, one item per line (items are converted with str()).
    """
    with open(out_path, "w") as out_file:
        out_file.writelines(str(item) + "\n" for item in out_list_f)


def spl(line, left_splitter, right_splitter, direction=0):
    """
    Crop a string between a left and a right splitter substring.
    direction=0: split on left_splitter first, then on right_splitter.
    direction=1: split on right_splitter first, then on left_splitter.
    Returns None when either splitter is missing from the input string
    (or when direction is neither 0 nor 1).
    """
    if left_splitter not in line or right_splitter not in line:
        return None
    if direction == 0:
        return line.split(left_splitter)[1].split(right_splitter)[0]
    if direction == 1:
        return line.split(right_splitter)[0].split(left_splitter)[1]
    return None


def print_with_fixed_row_length(seq, max_length):
    """
    Print a string to STDOUT in chunks of at most max_length characters,
    one chunk per line.
    """
    for start in range(0, len(seq), max_length):
        print(seq[start:start + max_length])


def split_with_fixed_row_length(seq, max_length):
    """
    Split a string into a list of chunks of at most max_length characters.
    """
    chunks = []
    for start in range(0, len(seq), max_length):
        chunks.append(seq[start:start + max_length])
    return chunks


def reverse_complement(seq):
    """
    Return the reverse complement of a DNA sequence.
    Both upper- and lower-case A/C/G/T/N are handled; any other character
    is passed through unchanged.
    """
    complement_table = str.maketrans("ACGTNacgtn", "TGCANtgcan")
    return seq.translate(complement_table)[::-1]


def read_fasta_in_chunks(in_path):
    """
    Iterate over a FASTA file, yielding one (header, sequence) tuple per
    record. The header is returned without the leading ">" and the record's
    sequence lines are concatenated into a single string.
    """
    header = None
    seq_parts = []
    for line in ll(in_path):
        if line == "":
            continue
        if line.startswith(">"):
            # A new header ends the previous record (if one was collected)
            if seq_parts:
                yield (header, "".join(seq_parts))
                seq_parts = []
            header = line[1:]
        else:
            seq_parts.append(line)
    if seq_parts:
        yield (header, "".join(seq_parts))


def list_to_chunks(lst, n):
    """
    Yield successive n-sized chunks from lst; the final chunk may be shorter.
    https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """
    start = 0
    while start < len(lst):
        yield lst[start:start + n]
        start += n


def string_to_chunks(line, n):
    """
    Split a string every nth character and return the pieces as a list.
    https://stackoverflow.com/questions/9475241/split-string-every-nth-character
    """
    pieces = []
    pos = 0
    while pos < len(line):
        pieces.append(line[pos:pos + n])
        pos += n
    return pieces


def run_system_command(system_command, verbose=True, dry_run=False, tries=1, expected_exit_code=0):
    """
    Executes a system command and checks its exit code

    system_command: command string, executed through a shell (so pipes and
        redirection are allowed)
    verbose: if True, a timestamped log line is written to STDERR before
        each attempt
    dry_run: if True, nothing is executed at all
    tries: number of attempts before giving up
    expected_exit_code: exit code treated as success; if the final attempt
        fails with any other code, the whole process is terminated by
        sending SIGINT to itself
    """
    # Name of the invoking script, used as a tag in log messages
    triggering_script_name = sys.argv[0].split("/")[-1]
    try_counter_string = ""
    if dry_run == False:
        for i in range(0, tries):
            if verbose == True:
                time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                if i > 0:
                    # Mark retries in the log, e.g. ", try 2"
                    try_counter_string = ", try {}".format(i + 1)
                out_message = "<{}, {}{}> executing command: {}\n".format(time_now, triggering_script_name, try_counter_string, system_command)
                sys.stderr.write(out_message)
            try:
                # NOTE(review): shell=True means the command is interpreted by the
                # shell — callers must not pass untrusted input to this function
                output = subprocess.check_output(system_command, stderr=subprocess.STDOUT, shell=True, timeout=None, universal_newlines=True)
                # Success: stop retrying
                break
            except subprocess.CalledProcessError as exc:
                out_errormessage = "<" + triggering_script_name + "> " + " exited with error code " + str(exc.returncode)
                # Append captured stdout+stderr only if it contains something visible
                if exc.output.isspace() == False:
                    out_errormessage += ". Error message: " + exc.output
                if i == tries - 1:
                    # Last attempt failed: abort the whole process unless this
                    # exit code was the expected one
                    if exc.returncode != expected_exit_code:
                        sys.stderr.write(out_errormessage + "\n")
                        os.kill(os.getpid(), signal.SIGINT)



def check_if_file_exists(in_path):
    """
    Exit with an error message on STDERR if the given file does not exist.
    """
    if not os.path.isfile(in_path):
        sys.stderr.write("The input file " + in_path + " was not found\n")
        sys.exit(1)


def get_file_paths(in_folder_path, extension):
    """
    Function for getting the paths to all files with a specific extension in a user-specified folder
    in_folder_path: path to the folder with input files
    extension: file extension of input files, without the leading dot
        (e.g. "fasta"; multi-part extensions such as "tar.gz" also work)
    Output: paths to individual files with the specific extension (list)
    Exits with an error if the folder does not exist.
    """
    if not os.path.isdir(in_folder_path):
        sys.stderr.write("Error: folder not found (" + in_folder_path + ")\n")
        sys.exit(1)
    suffix = "." + extension
    selected_file_paths = []
    for file_item in os.listdir(in_folder_path):
        # endswith() replaces the previous substring-plus-last-split check:
        # equivalent for single-part extensions, and additionally correct for
        # multi-part extensions ("tar.gz"), which the old check always rejected
        if file_item.endswith(suffix) and os.path.isfile(os.path.join(in_folder_path, file_item)):
            selected_file_paths.append(in_folder_path + "/" + file_item)
    return selected_file_paths

def main():
    """Placeholder entry point; the script currently only exposes --version."""
    return None

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-v", "--version", action="version", version="1.0")
    # Bug fix: parse_args() was never called, so "-v/--version" had no effect
    # and unrecognized arguments were silently accepted.
    parser.parse_args()
    main()
94 changes: 94 additions & 0 deletions bin/get_lineage_for_kraken_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""
Script for getting lineage for Kraken results
Developed by Eerik Aunin ([email protected])
"""

import general_purpose_functions as gpf
import sys
from collections import OrderedDict
import pandas as pd
import os
import signal
import argparse


def load_kraken_results(kraken_results_path):
    """
    Read sequence names and taxid values from a Kraken results file.
    Returns an OrderedDict mapping sequence name -> taxid (as a string).
    Lines without a "(taxid " tag or with too few fields are reported to
    STDERR and skipped; a duplicate read name aborts the whole process.
    """
    kraken_dict = OrderedDict()
    for line in gpf.ll(kraken_results_path):
        if "(taxid " not in line:
            sys.stderr.write("No taxid found in input file line:\n{}\n".format(line))
            continue
        split_line = line.split()
        if len(split_line) < 5:
            sys.stderr.write("Failed to parse Kraken output file line:\n{}\n".format(line))
            continue
        seq_name = split_line[1]
        if seq_name in kraken_dict:
            sys.stderr.write("Duplicate read names found in input ({})\n".format(seq_name))
            os.kill(os.getpid(), signal.SIGINT)
        else:
            kraken_dict[seq_name] = gpf.spl(line, "(taxid ", ")")
    return kraken_dict


def load_lineage(lineage_dump_path, kraken_db_name):
    """
    Reads lineage information from NCBI rankedlineage.dmp file (downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz) as a dictionary of dictionaries.
    Keys: taxid values. Values: a dictionary where the keys are taxonomic unit names
    (e.g. "species", "genus" or "family"), prefixed with the Kraken database
    name, and the values are the corresponding taxonomic names.
    """
    # Column order of rankedlineage.dmp after the leading taxid field
    rank_labels = ("name", "species", "genus", "family", "order", "class", "phylum", "kingdom", "domain")
    lineage_dict = OrderedDict()
    for line in gpf.ll(lineage_dump_path):
        fields = [field.strip() for field in line.split("|")]
        lineage_dict[fields[0]] = {
            "{}_kraken_{}".format(kraken_db_name, label): fields[idx + 1]
            for idx, label in enumerate(rank_labels)
        }
    return lineage_dict


def get_kraken_and_lineage_dict(kraken_dict, lineage_dict, kraken_db_name):
    """
    Merge per-sequence Kraken taxid assignments with NCBI lineage data.
    For each sequence the lineage entry for its taxid is used (and the taxid
    itself is recorded in it); taxids missing from the lineage dump get a
    placeholder entry with taxid "0" and None for every taxonomic rank, and
    any missing non-zero taxid is reported to STDERR.
    """
    prefix = kraken_db_name + "_kraken_"
    merged = OrderedDict()
    for seq_name, taxid in kraken_dict.items():
        if taxid in lineage_dict:
            # Note: this deliberately mutates the shared lineage entry,
            # adding the taxid key, exactly as the original code did
            entry = lineage_dict[taxid]
            entry[prefix + "taxid"] = taxid
        else:
            if taxid != "0":
                sys.stderr.write("Taxid {} was not found in the lineages dump file\n".format(taxid))
            entry = {prefix + "taxid": "0"}
            for label in ("name", "species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"):
                entry[prefix + label] = None
        merged[seq_name] = entry
    return merged


def main(kraken_results_path, lineage_dump_path, kraken_db_name, out_path):
    """
    Merge Kraken classifications with NCBI lineage information and write the
    result to a CSV file whose index column ("scaff") holds sequence names.
    """
    merged = get_kraken_and_lineage_dict(
        load_kraken_results(kraken_results_path),
        load_lineage(lineage_dump_path, kraken_db_name),
        kraken_db_name,
    )
    out_df = pd.DataFrame.from_dict(merged).transpose()
    out_df.index = out_df.index.rename("scaff")
    out_df.to_csv(out_path)


if __name__ == "__main__":
    # Command-line interface: positional arguments mirror main()'s parameters.
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument("-v", "--version", action="version", version="1.0")
    arg_parser.add_argument("kraken_results_path", help="Path to output file of a Kraken run", type=str)
    arg_parser.add_argument(
        "lineage_dump_path",
        help="Path to an NCBI taxonomy rankedlineage.dmp file (downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz)",
        type=str,
    )
    arg_parser.add_argument("kraken_db_name", help="Kraken database name", type=str, choices=["bacterial", "nt"])
    arg_parser.add_argument("out_path", help="Path for output CSV file", type=str)
    cli_args = arg_parser.parse_args()
    main(cli_args.kraken_results_path, cli_args.lineage_dump_path, cli_args.kraken_db_name, cli_args.out_path)
6 changes: 6 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ process {
maxRetries = 1
maxErrors = '-1'

withName:KRAKEN2_KRAKEN2 {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 400.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
}

// Process-specific resource requirements
// NOTE - Please try and re-use the labels below as much as possible.
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
Expand Down
4 changes: 4 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,8 @@ process {
]
}

withName: KRAKEN2_KRAKEN2 {
ext.args = { "--report-zero-counts --use-names --memory-mapping" }
}

}
8 changes: 8 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
"modules"
]
},
"kraken2/kraken2": {
"branch": "master",
"git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",
"installed_by": [
"modules"
],
"patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff"
},
"multiqc": {
"branch": "master",
"git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7",
Expand Down
Loading

0 comments on commit f2d1d58

Please sign in to comment.