From 1942db43bcb862605a4b5b07f382b123f4e265be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lars=20Sch=C3=B6ning?= Date: Thu, 31 Mar 2016 16:40:17 +0200 Subject: [PATCH] feat: add basic support for ranges --- genotype.enbf | 6 +++--- gnomic/grammar.py | 17 +++++++++-------- gnomic/models.py | 23 +++++++++++++++++++++++ gnomic/semantics.py | 16 +++++++++++++++- setup.py | 2 +- tests/test_genotype.py | 36 +++++++++++++++++++++++++++++++++++- tests/test_grammar.py | 13 +++++++++---- 7 files changed, 95 insertions(+), 18 deletions(-) diff --git a/genotype.enbf b/genotype.enbf index 700f470..de1f0cd 100644 --- a/genotype.enbf +++ b/genotype.enbf @@ -50,10 +50,10 @@ VARIANT_DEFINITION = @:IDENTIFIER {("," | ";") [sep] @:IDENTIFIER}*; BINARY_VARIANT = "+" | "-"; -RANGE = "[" [type:RANGE_SEQUENCE_TYPE] start:INTEGER "_" end:INTEGER "]" - | "[" [type:RANGE_SEQUENCE_TYPE] pos:INTEGER "]"; +RANGE = "[" [level:RANGE_SEQUENCE_LEVEL] start:INTEGER "_" end:INTEGER "]" + | "[" [level:RANGE_SEQUENCE_LEVEL] pos:INTEGER "]"; -RANGE_SEQUENCE_TYPE = ("c" | "p") "."; +RANGE_SEQUENCE_LEVEL = @:("c" | "p") "."; (* NOTE ACCESSION with its optional ":" can be ambiguous when it is used within a fusion and the IDENTIFIER is not numeric. In these cases, a DATABASE should be specified. *) diff --git a/gnomic/grammar.py b/gnomic/grammar.py index 15a1234..4c4db6f 100644 --- a/gnomic/grammar.py +++ b/gnomic/grammar.py @@ -17,7 +17,7 @@ from grako.util import re, RE_FLAGS -__version__ = (2016, 1, 19, 8, 48, 30, 1) +__version__ = (2016, 3, 31, 14, 28, 44, 3) __all__ = [ 'GnomicParser', @@ -373,8 +373,8 @@ def _RANGE_(self): with self._option(): self._token('[') with self._optional(): - self._RANGE_SEQUENCE_TYPE_() - self.ast['type'] = self.last_node + self._RANGE_SEQUENCE_LEVEL_() + self.ast['level'] = self.last_node self._INTEGER_() self.ast['start'] = self.last_node self._token('_') @@ -384,20 +384,20 @@ def _RANGE_(self): with self._option(): self._token('[') with self._optional(): - self._RANGE_SEQUENCE_TYPE_() - self.ast['type'] = self.last_node + self._RANGE_SEQUENCE_LEVEL_() + self.ast['level'] = self.last_node self._INTEGER_() self.ast['pos'] = self.last_node self._token(']') self._error('no available options') self.ast._define( - ['type', 'start', 'end', 'pos'], + ['level', 'start', 'end', 'pos'], [] ) @graken() - def _RANGE_SEQUENCE_TYPE_(self): + def _RANGE_SEQUENCE_LEVEL_(self): with self._group(): with self._choice(): with self._option(): @@ -405,6 +405,7 @@ def _RANGE_SEQUENCE_TYPE_(self): with self._option(): self._token('p') self._error('expecting one of: c p') + self.ast['@'] = self.last_node self._token('.') @graken() @@ -518,7 +519,7 @@ def BINARY_VARIANT(self, ast): def RANGE(self, ast): return ast - def RANGE_SEQUENCE_TYPE(self, ast): + def RANGE_SEQUENCE_LEVEL(self, ast): return ast def ACCESSION(self, ast): diff --git a/gnomic/models.py b/gnomic/models.py index 89e0a02..b348a1a 100644 --- a/gnomic/models.py +++ b/gnomic/models.py @@ -249,6 +249,29 @@ def __repr__(self): for key, value in self.__dict__.items() if value)) +class Range(object): + """ + An inclusive range at a coding (DNA), RNA or protein level. + """ + def __init__(self, level, start, end): + self.level = level + self.start = start + self.end = end + + def __hash__(self): + return hash(self.level) + \ + hash(self.start) + \ + hash(self.end) + + def __len__(self): + return self.end - self.start + 1 + + def __repr__(self): + if self.start == self.end: + return '{}({}, {})'.format(self.__class__.__name__, repr(self.level), self.start) + return '{}({}, {}, {})'.format(self.__class__.__name__, repr(self.level), self.start, self.end) + + class Organism(object): def __init__(self, name, aliases=None): self.name = name diff --git a/gnomic/semantics.py b/gnomic/semantics.py index 7dc0320..dcc91a6 100644 --- a/gnomic/semantics.py +++ b/gnomic/semantics.py @@ -1,4 +1,4 @@ -from gnomic.models import Mutation, Fusion, Plasmid, Feature, Organism, Accession, Type, FeatureTree +from gnomic.models import Mutation, Fusion, Plasmid, Feature, Organism, Accession, Type, FeatureTree, Range from gnomic.grammar import GnomicSemantics @@ -37,6 +37,20 @@ def replacement(self, ast): def deletion(self, ast): return Mutation(ast.old, None, marker=ast.marker) + def RANGE(self, ast): + level = { + 'c': 'coding', + 'r': 'RNA', + 'p': 'protein' + }[ast.level] + + if ast.pos: + return Range(level, ast.pos, ast.pos) + return Range(level, ast.start, ast.end) + + def INTEGER(self, ast): + return int(ast) + def ACCESSION(self, ast): return Accession(ast['id'], ast['db']) diff --git a/setup.py b/setup.py index bf5c638..b8702ba 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='gnomic', - version='0.1.1', + version='0.2.0', packages=find_packages(exclude=['*tests*']), license='Apache', author='Lars Schöning', diff --git a/tests/test_genotype.py b/tests/test_genotype.py index a085c43..c690000 100644 --- a/tests/test_genotype.py +++ b/tests/test_genotype.py @@ -1,6 +1,6 @@ from unittest import TestCase, SkipTest -from gnomic import Genotype, Feature, Ins, Del, Fusion, Sub, Type +from gnomic import Genotype, Feature, Ins, Del, Fusion, Sub, Type, Range class BaseTestCase(TestCase): @@ -118,6 +118,40 @@ def test_no_delete_if_present(self): Del(Feature(name='geneA')), }, self.chain('+geneA(x)', '-geneA').changes()) + +class GenotypeRangeTestCase(BaseTestCase): + + def test_delete_range_basic(self): + self.assertEqual({ + Del(Feature(name='geneA', range=Range('coding', 5, 10))), + }, self.chain('-geneA[c.5_10]').changes()) + + self.assertEqual({ + Del(Feature(name='geneA', range=Range('protein', 5, 5))), + }, self.chain('-geneA[p.5]').changes()) + + def test_delete_insert(self): + self.assertEqual({ + Ins(Feature(name='geneA')), + }, self.chain('-geneA[c.5_10]', '+geneA').changes()) + + @SkipTest + def test_delete_multiple_ranges(self): + # TODO in the current implementation, only the most recently deleted range is accounted for. + # TODO this implementation may change + + self.assertEqual({ + # Del(Feature(name='geneA', range=Range('coding', 5, 10))), + Del(Feature(name='geneA', range=Range('coding', 11, 12))), + }, self.chain('-geneA[c.5_10]', '-geneA[c.11_12]').changes()) + + self.assertEqual({ + Del(Feature(name='geneA')) + }, self.chain('-geneA[c.5_10]', '-geneA').changes()) + + + # TODO detailed tracking of different bits & pieces of features. + class GenotypeFusionsTestCase(BaseTestCase): @SkipTest diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 46151b2..b317c00 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -25,17 +25,22 @@ def test_parse_simple_insertions(self): ], parse('+fooF')) self.assertEqual([ - Ins(Feature(name='fooF', accession=Accession(identifier='123', database='FOO'))) + Ins(Feature(name='fooF', accession=Accession(identifier=123, database='FOO'))) ], parse('+fooF#FOO:123')) self.assertEqual([ - Ins(Feature(accession=Accession(identifier='123', database='FOO'))) + Ins(Feature(accession=Accession(identifier=123, database='FOO'))) ], parse('+#FOO:123')) self.assertEqual([ - Ins(Feature(accession=Accession(identifier='123'))) + Ins(Feature(accession=Accession(identifier='BAR', database='FOO'))) + ], parse('+#FOO:BAR')) + + self.assertEqual([ + Ins(Feature(accession=Accession(identifier=123))) ], parse('+#123')) + def test_parse_variants(self): self.assertEqual([ Feature(type=Type('phene'), name='A', variant='wild-type') @@ -69,7 +74,7 @@ def test_parse_variants(self): self.assertEqual([ Feature(type=Type('phene'), - accession=Accession(identifier='123', database='FOO'), + accession=Accession(identifier=123, database='FOO'), variant='wild-type') ], parse('#FOO:123+'))