Skip to content

Commit

Permalink
Merge pull request #118 from padix-key/master
Browse files Browse the repository at this point in the history
More efficient CodonTable
  • Loading branch information
padix-key authored Jul 22, 2019
2 parents 2587e23 + 57d65ac commit eb1bb5e
Show file tree
Hide file tree
Showing 4 changed files with 331 additions and 114 deletions.
302 changes: 237 additions & 65 deletions src/biotite/sequence/codon.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,21 @@

import copy
from os.path import join, dirname, realpath
import numpy as np
from numbers import Integral
from .seqtypes import NucleotideSequence, ProteinSequence


# Abbreviations
_NUC_ALPH = NucleotideSequence.alphabet
_PROT_ALPH = ProteinSequence.alphabet

# Multiplier array that converts a codon in code representation
# into a unique integer
_radix = len(_NUC_ALPH)
_radix_multiplier = np.array([_radix**n for n in (2,1,0)], dtype=int)


class CodonTable(object):
"""
A `CodonTable` maps a codon (sequence of 3 nucleotides) to an amino
Expand All @@ -25,11 +37,11 @@ class CodonTable(object):
Parameters
----------
codon_dict : dict
codon_dict : dict of (str -> str)
A dictionary that maps codons to amino acids. The keys must be
strings of length 3 and the values strings of length 1
(all upper case). The dictionary must provide entries for all
64 possible codons.
(all upper case).
The dictionary must provide entries for all 64 possible codons.
starts : iterable object of str
The start codons. Each entry must be a string of length 3
(all upper case).
Expand All @@ -51,83 +63,133 @@ class CodonTable(object):
>>> print(table["M"])
('ATG',)
>>> print(table[14])
((1, 2, 3), (1, 2, 1), (1, 2, 0), (1, 2, 2), (0, 2, 0), (0, 2, 2))
((0, 2, 0), (0, 2, 2), (1, 2, 0), (1, 2, 1), (1, 2, 2), (1, 2, 3))
"""

# file for codon tables
# For efficient mapping of codon codes to amino acid codes,
# especially in in the 'map_codon_codes()' function, the class
# maps each possible codon into a unique number using a radix based
# approach.
# For example the codon (3,1,2) would be represented as
# 3*16 + 1*4 + 2**1 = 53

# file for builtin codon tables from NCBI
_table_file = join(dirname(realpath(__file__)), "codon_tables.txt")

def __init__(self, codon_dict, starts):
# Check if 'starts' is iterable objectof length 3 strings
# Check if 'starts' is iterable object of length 3 string
for start in starts:
if not isinstance(start, str) or len(start) != 3:
raise ValueError(f"Invalid codon '{start}' as start codon")
self._symbol_dict = dict(codon_dict.items())
self._code_dict = {}
for key, value in self._symbol_dict.items():
key_code = tuple(NucleotideSequence.alphabet.encode_multiple(key))
val_code = ProteinSequence.alphabet.encode(value)
self._code_dict[key_code] = val_code
self._start_symbols = tuple((starts))
self._start_codes = tuple(
[tuple(NucleotideSequence.alphabet.encode_multiple(start))
for start in self._start_symbols]
# Internally store codons as single unique numbers
start_codon_codes = np.array(
[_NUC_ALPH.encode_multiple(start) for start in starts], dtype=int
)
self._starts = CodonTable._to_number(start_codon_codes)
# Use -1 as error code
# The array uses the number representation of codons as index
# and stores the corresponding symbol codes for amino acids
self._codons = np.full(_radix**3, -1, dtype=int)
for key, value in codon_dict.items():
codon_code = _NUC_ALPH.encode_multiple(key)
codon_number = CodonTable._to_number(codon_code)
aa_code = _PROT_ALPH.encode(value)
self._codons[codon_number] = aa_code
if (self._codons == -1).any():
# Find the missing codon
missing_index = np.where(self._codons == -1)[0][0]
codon_code = CodonTable._to_codon(missing_index)
codon = _NUC_ALPH.decode_multiple(codon_code)
codon_str = "".join(codon)
raise ValueError(
f"Codon dictionary does not contain codon '{codon_str}'"
)

def __getitem__(self, item):
if isinstance(item, str):
if len(item) == 1:
# Amino acid -> return possible codons
codons = []
for key, val in self._symbol_dict.items():
if val == item:
codons.append(key)
return tuple(codons)
aa_code = _PROT_ALPH.encode(item)
codon_numbers = np.where(self._codons == aa_code)[0]
codon_codes = CodonTable._to_codon(codon_numbers)
codons = tuple(
["".join(_NUC_ALPH.decode_multiple(codon_code))
for codon_code in codon_codes]
)
return codons
elif len(item) == 3:
# Codon -> return corresponding amino acid
return self._symbol_dict[item]
codon_code = _NUC_ALPH.encode_multiple(item)
codon_number = CodonTable._to_number(codon_code)
aa_code = self._codons[codon_number]
aa = _PROT_ALPH.decode(aa_code)
return aa
else:
raise ValueError(f"'{item}' is an invalid index")
elif isinstance(item, int):
# Code for amino acid -> return possible codon codes
codons = []
for key, val in self._code_dict.items():
if val == item:
codons.append(key)
return tuple(codons)
elif isinstance(item, tuple):
# Code for codon -> return corresponding amino acid code
return self._code_dict[item]
codon_numbers = np.where(self._codons == item)[0]
codon_codes = tuple(CodonTable._to_codon(codon_numbers))
codon_codes = tuple([tuple(code) for code in codon_codes])
return codon_codes
else:
raise TypeError(
f"'{type(item).__name__}' objects are invalid indices"
)
# Code for codon as any iterable object
# Code for codon -> return corresponding amino acid codes
if len(item) != 3:
raise ValueError(
f"{item} is an invalid sequence code for a codon"
)
codon_number = CodonTable._to_number(item)
aa_code = self._codons[codon_number]
return aa_code

def __str__(self):
string = ""
bases = ["A","C","G","T"]
for b1 in bases:
for b2 in bases:
for b3 in bases:
codon = b1 + b2 + b3
string += codon + " " + self._symbol_dict[codon]
# Indicator for start codon
if codon in self._start_symbols:
string += " i "
else:
string += " "
# Add space for next codon
string += " "*3
# Remove terminal space
string = string [:-6]
# Jump to next line
string += "\n"
# Add empty line
string += "\n"
# Remove the two terminal new lines
string = string[:-2]
return string

def map_codon_codes(self, codon_codes):
"""
Efficiently map multiple codons to the corresponding amino
acids.
Parameters
----------
codon_codes : ndarray, dtype=int, shape=(n,3)
The codons to be translated into amino acids.
The codons are given as symbol codes.
*n* is the amount of codons.
Returns
-------
aa_codes : ndarray, dtype=int, shape=(n,)
The amino acids as symbol codes.
Examples
--------
>>> dna = NucleotideSequence("ATGGTTTAA")
>>> sequence_code = dna.code
>>> print(sequence_code)
[0 3 2 2 3 3 3 0 0]
>>> # Reshape to get codons
>>> codon_codes = sequence_code.reshape(-1, 3)
>>> print(codon_codes)
[[0 3 2]
[2 3 3]
[3 0 0]]
>>> # Map to amino acids
>>> aa_codes = CodonTable.default_table().map_codon_codes(codon_codes)
>>> print(aa_codes)
[10 17 23]
>>> # Put into a protein sequence
>>> protein = ProteinSequence()
>>> protein.code = aa_codes
>>> print(protein)
MV*
"""
if codon_codes.shape[-1] != 3:
raise ValueError(
f"Codons must be length 3, "
f"but size of last dimension is {codon_codes.shape[-1]}"
)
codon_numbers = CodonTable._to_number(codon_codes)
aa_codes = self._codons[codon_numbers]
return aa_codes

def codon_dict(self, code=False):
"""
Expand All @@ -146,9 +208,17 @@ def codon_dict(self, code=False):
The dictionary mapping codons to amino acids.
"""
if code:
return copy.copy(self._code_dict)
return {tuple(CodonTable._to_codon(codon_number)): aa_code
for codon_number, aa_code in enumerate(self._codons)}
else:
return copy.copy(self._symbol_dict)
return {"".join(_NUC_ALPH.decode_multiple(codon_code)):
_PROT_ALPH.decode(aa_code)
for codon_code, aa_code
in self.codon_dict(code=True).items()}

def is_start_codon(self, codon_codes):
codon_numbers = CodonTable._to_number(codon_codes)
return np.isin(codon_numbers, self._starts)

def start_codons(self, code=False):
"""
Expand All @@ -167,10 +237,112 @@ def start_codons(self, code=False):
the `code` parameter.
"""
if code:
return self._start_codes
return tuple(
[tuple(CodonTable._to_codon(codon_number))
for codon_number in self._starts]
)
else:
return self._start_symbols
return tuple(
["".join(_NUC_ALPH.decode_multiple(codon_code))
for codon_code in self.start_codons(code=True)]
)

def with_start_codons(self, starts):
"""
Create an new `CodonTable` with the same codon mappings, but
changed start codons.
Parameters
----------
starts : iterable object of str
The new start codons.
Returns
-------
new_table : CodonTable
The codon table with the new start codons.
"""
# Copy this table and replace the start codons
new_table = copy.deepcopy(self)
start_codon_codes = np.array(
[_NUC_ALPH.encode_multiple(start) for start in starts], dtype=int
)
new_table._starts = CodonTable._to_number(start_codon_codes)
return new_table

def with_codon_mappings(self, codon_dict):
"""
Create an new `CodonTable` with partially changed codon
mappings.
Parameters
----------
codon_dict : dict of (str -> str)
The changed codon mappings.
Returns
-------
new_table : CodonTable
The codon table with changed codon mappings.
"""
# Copy this table and replace the codon
new_table = copy.deepcopy(self)
for key, value in codon_dict.items():
codon_code = _NUC_ALPH.encode_multiple(key)
codon_number = CodonTable._to_number(codon_code)
aa_code = _PROT_ALPH.encode(value)
new_table._codons[codon_number] = aa_code
return new_table


def __str__(self):
string = ""
# ['A', 'C', 'G', 'T']
bases = _NUC_ALPH.get_symbols()
for b1 in bases:
for b2 in bases:
for b3 in bases:
codon = b1 + b2 + b3
string += codon + " " + self[codon]
# Indicator for start codon
codon_code = _NUC_ALPH.encode_multiple(codon)
if CodonTable._to_number(codon_code) in self._starts:
string += " i "
else:
string += " "
# Add space for next codon
string += " "*3
# Remove terminal space
string = string [:-6]
# Jump to next line
string += "\n"
# Add empty line
string += "\n"
# Remove the two terminal new lines
string = string[:-2]
return string

@staticmethod
def _to_number(codons):
if not isinstance(codons, np.ndarray):
codons = np.array(list(codons), dtype=int)
return np.sum(_radix_multiplier * codons, axis=-1)

@staticmethod
def _to_codon(numbers):
if isinstance(numbers, Integral):
# Only a single number
return CodonTable._to_codon(np.array([numbers]))[0]
if not isinstance(numbers, np.ndarray):
numbers = np.array(list(numbers), dtype=int)
codons = np.zeros(numbers.shape + (3,), dtype=int)
for n in (2,1,0):
val = _radix**n
digit = numbers // val
codons[..., -(n+1)] = digit
numbers = numbers - digit * val
return codons

@staticmethod
def load(table_name):
"""
Expand Down Expand Up @@ -226,7 +398,7 @@ def load(table_name):
elif line.startswith("Base3"):
base3 = line[5:].strip()

# Create codon tbale from data
# Create codon table from data
if aa is not None and init is not None \
and base1 is not None and base2 is not None and base3 is not None:
symbol_dict = {}
Expand All @@ -240,7 +412,7 @@ def load(table_name):
return CodonTable(symbol_dict, starts)
else:
raise ValueError(f"Codon table '{table_name}' was not found")

@staticmethod
def table_names():
"""
Expand Down Expand Up @@ -273,6 +445,6 @@ def default_table():
"""
return _default_table

_default_table = CodonTable(CodonTable.load("Standard").codon_dict(), ["ATG"])
_default_table = CodonTable.load("Standard").with_start_codons(["ATG"])


Loading

0 comments on commit eb1bb5e

Please sign in to comment.