Apply ontology to trees
kjosib committed Nov 16, 2021
1 parent ed33336 commit fd0ed82
Showing 4 changed files with 98 additions and 51 deletions.
53 changes: 46 additions & 7 deletions boozetools/arborist/trees.py
@@ -22,10 +22,10 @@
3. A consistent source-tracking channel for tracing the origin(s) of bogus input.
4. The possibility of data-driven transformations on tree structures.
-What's here isn't necessarily perfect yet, but it's doing some jobs.
+What's here is far from perfect yet, but it's doing some jobs.
"""

-__all__ = ['make_symbol', 'Node']
+__all__ = ['Ontology', 'OntologyError', 'Node']

from typing import NamedTuple, Optional
from dataclasses import dataclass
@@ -34,10 +34,10 @@
class _Symbol(NamedTuple):
""" A tree node's "symbol" corresponds to its "constructor" in an "abstract data types" conception of trees. """
label: str
-arity: tuple[str, ...]
+arity: tuple[str, ...] # Why strings? Why can't they be category-objects? Or have the list nature?
index: dict[str, int]
category: Optional[str] # Category may be thought of as a "data type" which may have several constructors/symbols.
-origin: object
+ontology: "Ontology"

def node(self, *, semantic:object, children:tuple["Node", ...], debug_info) -> "Node":
"""
@@ -58,9 +58,48 @@ def from_args(self, *children, debug_info=None):
""" Convenience function for mini-parse grammars. """
return self.node(semantic=None, children=children, debug_info=debug_info)

-def make_symbol(label:str, kids:dict[str,str], category:str=None, origin=None):
-return _Symbol(label, tuple(kids.values()), dict((k,i) for i,k in enumerate(kids.keys())), category, origin)
+class OntologyError(ValueError):
+pass
+
+class Ontology:
+"""
+The symbols in a given ontology are meant to hang together.
+This is fairly simplistic, in that the categories do not form any sort of network.
+But it will do for experimentation.
+Two obvious enhancement ideas:
+For error reporting, you might want to know where the ontology came from.
+For language embedding, you might want to import symbols from another ontology.
+"""
+def __init__(self):
+self.symbols = {}
+self.defined_categories = {}
+self.mentioned_categories = set()
+
+def __getitem__(self, item):
+return self.symbols[item]
+
+def define_category(self, category:str, cases:dict[str,dict[str,str]]):
+"""
+category: a string describing the general data type all the cases fulfill.
+cases: dict[label, dict[field, category]]
+Why not accept a term from a meta-ontology? Because -- well -- not yet.
+"""
+if category in self.defined_categories: raise OntologyError(category)
+self.defined_categories[category] = set(cases.keys())
+for label, kids in cases.items():
+if label in self.symbols: raise OntologyError(label)
+self.mentioned_categories.update(kids.values())
+self.symbols[label] = _Symbol(label, tuple(kids.values()), dict((k,i) for i,k in enumerate(kids.keys())), category, self)
+
+def check_sanity(self):
+"""
+The ontology is sane when every field's category is defined.
+This would have to change if and when imports happen.
+"""
+bogons = self.mentioned_categories - self.defined_categories.keys()
+if bogons: raise OntologyError(bogons)
+

@dataclass(eq=False)
class Node:
Expand All @@ -69,7 +108,7 @@ class Node:
This structure will be the backbone of the "arborist" framework.
"""
__slots__ = ('symbol', 'semantic', 'children', 'debug_info')
-symbol: _Symbol # Refers into a dictionary of symbol definitions.
+symbol: _Symbol  # Refers into a dictionary of symbol definitions.
semantic: object # Mutable in general, but a bottom-up pass may provide a basis object.
children: tuple # Must have correct arity for the symbol.
debug_info: object # Although this remains application-defined, often a file position might work.
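
Before moving on to the call sites: a minimal sketch of the new API end to end, using only what the hunks above show. The 'expr' category and its labels are invented for illustration, and it is assumed (the body of _Symbol.node() lies outside this excerpt) that node() accepts any children tuple matching the symbol's arity.

from boozetools.arborist.trees import Ontology

VOCAB = Ontology()                          # a hypothetical vocabulary, not one from this commit
VOCAB.define_category('expr', {
    'Literal': {},                          # no fields; the semantic slot would carry the value
    'Add': {'lhs': 'expr', 'rhs': 'expr'},  # two children, each of category 'expr'
})
VOCAB.check_sanity()                        # passes: the only category mentioned is also defined

lit = VOCAB['Literal'].from_args()          # arity () -- a childless Node
tree = VOCAB['Add'].from_args(lit, lit)     # arity ('expr', 'expr')

Defining 'expr' twice, or reusing a label in another category, raises OntologyError: both of the Ontology's dictionaries double as uniqueness checks.
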
29 changes: 15 additions & 14 deletions boozetools/scanning/miniscan.py
@@ -5,7 +5,8 @@
from ..parsing import miniparse
from . import finite, regular, charset, recognition

-PRELOAD = {'ASCII': {k: regular.char_prebuilt.leaf(cls) for k, cls in charset.mode_ascii.items()}}
+char_prebuilt = regular.VOCAB['CharPrebuilt']
+PRELOAD = {'ASCII': {k: char_prebuilt.leaf(cls) for k, cls in charset.mode_ascii.items()}}


class Definition:
@@ -180,7 +181,7 @@ def _BOOTSTRAP_REGEX_SCANNER_():
def seq(head, *tail):
for t in tail: head = regular.VOCAB['Sequence'].from_args(head, t)
return head
-def txt(s):return seq(*(regular.codepoint.leaf(ord(_)) for _ in s))
+def txt(s):return seq(*(regular.VOCAB['Codepoint'].leaf(ord(_)) for _ in s))

def _metatoken(yy): yy.token(yy.matched_text(), None)
def _and_then(condition):
@@ -195,24 +196,24 @@ def fn(yy):
return fn
def _bracket_reference(yy:interfaces.Scanner):
name = yy.matched_text()[1:-1]
-node = regular.named_subexpression.leaf(name, yy.current_span())
+node = regular.VOCAB['NamedSubexpression'].leaf(name, yy.current_span())
yy.token('reference', node)
def _shorthand_reference(yy:interfaces.Scanner):
-yy.token('reference', regular.named_subexpression.leaf(yy.matched_text()[1], yy.current_span()))
+yy.token('reference', regular.VOCAB['NamedSubexpression'].leaf(yy.matched_text()[1], yy.current_span()))
def _dot_reference(yy:interfaces.Scanner):
-yy.token('reference', regular.named_subexpression.leaf('DOT', yy.current_span()))
-def _hex_escape(yy): yy.token('codepoint', regular.codepoint.leaf(int(yy.matched_text()[2:], 16)))
-def _control(yy): yy.token('codepoint', regular.codepoint.leaf(31 & ord(yy.matched_text()[2:])))
-def _arbitrary_character(yy): yy.token('codepoint', regular.codepoint.leaf(ord(yy.matched_text())))
+yy.token('reference', regular.VOCAB['NamedSubexpression'].leaf('DOT', yy.current_span()))
+def _hex_escape(yy): yy.token('codepoint', regular.VOCAB['Codepoint'].leaf(int(yy.matched_text()[2:], 16)))
+def _control(yy): yy.token('codepoint', regular.VOCAB['Codepoint'].leaf(31 & ord(yy.matched_text()[2:])))
+def _arbitrary_character(yy): yy.token('codepoint', regular.VOCAB['Codepoint'].leaf(ord(yy.matched_text())))
def _class_initial_close_bracket(yy):
yy.enter('in_class')
_arbitrary_character(yy)
def _class_final_dash(yy):
yy.token('codepoint', ord('-'))
yy.token(']', None)
yy.enter(None)
-def _arbitrary_escape(yy): yy.token('codepoint', regular.codepoint.leaf(ord(yy.matched_text()[1:])))
-def _number(yy): yy.token('number', regular.bound.leaf(int(yy.matched_text())))
+def _arbitrary_escape(yy): yy.token('codepoint', regular.VOCAB['Codepoint'].leaf(ord(yy.matched_text()[1:])))
+def _number(yy): yy.token('number', regular.VOCAB['Bound'].leaf(int(yy.matched_text())))
def _dollar(charclass):
def fn(yy:Scanner): yy.token('end', charclass)
return fn
@@ -228,8 +229,8 @@ def ref(x): return PRELOAD['ASCII'][x]

dot = ref('DOT')

-eof_charclass = regular.char_prebuilt.leaf(charset.EOF)
-dollar_charclass = regular.char_prebuilt.leaf(charset.union(charset.EOF, PRELOAD['ASCII']['vertical'].semantic))
+eof_charclass = regular.VOCAB['CharPrebuilt'].leaf(charset.EOF)
+dollar_charclass = regular.VOCAB['CharPrebuilt'].leaf(charset.union(charset.EOF, PRELOAD['ASCII']['vertical'].semantic))

meta = Definition()

@@ -257,9 +258,9 @@ def ref(x): return PRELOAD['ASCII'][x]
anywhere.install_rule(expression=seq(txt('{'), ref('alpha'), regular.VOCAB['Plus'].from_args(ref('word')), txt('}'), ), action=_bracket_reference)
whack = txt('\\') # NB: Python doesn't let you end a raw-string with a backslash.
for c, n in [('x', 2), ('u', 4), ('U', 8)]:
-hexblock = regular.VOCAB['n_times'].from_args(ref('xdigit'), regular.bound.leaf(n))
+hexblock = regular.VOCAB['n_times'].from_args(ref('xdigit'), regular.VOCAB['Bound'].leaf(n))
anywhere.install_rule(expression=seq(whack, txt(c), hexblock), action=_hex_escape)
-anywhere.install_rule(expression=seq(whack, txt('c'), regular.char_prebuilt.leaf(charset.range_class(64, 127))), action=_control)
+anywhere.install_rule(expression=seq(whack, txt('c'), regular.VOCAB['CharPrebuilt'].leaf(charset.range_class(64, 127))), action=_control)
anywhere.install_rule(expression=seq(whack, ref('alnum')), action=_shorthand_reference)
anywhere.install_rule(expression=seq(whack, dot), action=_arbitrary_escape)
anywhere.install_rule(expression=dot, action=_arbitrary_character)
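
Every change in this file is the same move: a free-standing symbol constant becomes a label-keyed lookup in the regular module's shared VOCAB. Schematically (leaf() is assumed, from its uses above, to wrap a semantic value in a childless Node; its definition lies outside this excerpt):

# Before this commit: each symbol was a module-level constant.
node = regular.codepoint.leaf(ord('A'))

# After: one Ontology owns every symbol, so call sites name them by label.
node = regular.VOCAB['Codepoint'].leaf(ord('A'))
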
60 changes: 33 additions & 27 deletions boozetools/scanning/regular.py
@@ -1,6 +1,6 @@
""" An AST class-hierarchy for regular expressions and mechanisms for translating them. """

-from ..arborist.trees import make_symbol, Node
+from ..arborist.trees import Ontology, Node
from . import finite, charset

class PatternError(Exception):
@@ -38,39 +38,45 @@ class PatternSyntaxError(PatternError):
"""
errmsg = "Pattern syntax does not compute."

-codepoint = make_symbol('Codepoint', {}, 'char_class') # Semantic is codepoint.

+VOCAB = Ontology()
# A bit of theory: A character class is an intersection of one or more (possibly-inverted) set/unions;
# each set consists of one or more of codepoints, ranges, and named-classes. Therefore, we get this alphabet:
-VOCAB = {s:make_symbol(s,k,c) for (s,k,c) in [
-('CharRange', {'first':'Codepoint', 'last':'Codepoint'}, 'char_class'),
-('Sequence', {'a':'regular', 'b':'regular'}, 'regex'),
-('Alternation', {'a':'regular', 'b':'regular'}, 'regex'),
-('Star', {'sub':'regular'}, 'regex'),
-('Hook', {'sub':'regular'}, 'regex'),
-('Plus', {'sub':'regular'}, 'regex'),
-('n_times', {'sub':'regular', 'num':'Bound'}, 'regex'),
-('n_or_more', {'sub':'regular', 'min':'Bound'}, 'regex'),
-('n_or_fewer', {'sub':'regular', 'max':'Bound'}, 'regex'),
-('n_to_m', {'sub':'regular', 'min':'Bound', 'max':'Bound'}, 'regex'),
-('CharUnion', {'a': 'char_class', 'b': 'char_class', }, 'char_class'),
-('CharIntersection', {'a': 'char_class', 'b': 'char_class', }, 'char_class'),
-('CharComplement', {'inverse': 'char_class'}, 'char_class'),
-('pattern_regular', {'left_context':'left_context', 'stem':'regular'}, 'pattern'),
-('pattern_with_trail', {'left_context':'left_context', 'stem':'regular', 'trail':'regular'}, 'pattern'),
-('pattern_only_trail', {'left_context':'left_context', 'trail':'regular'}, 'pattern'),
-]}
-char_prebuilt = make_symbol('CharPrebuilt', {}, 'char_class')
-bound = make_symbol('Bound', {}) # Semantic is number (or None).
-named_subexpression = make_symbol('NamedSubexpression', {}, 'regex') # Semantic is subexpression name.

+VOCAB.define_category('Codepoint', {
+'Codepoint': {}, # Semantic is codepoint.
+})
+VOCAB.define_category('char_class', {
+'CharRange': {'first':'Codepoint', 'last':'Codepoint'},
+'CharUnion': {'a': 'char_class', 'b': 'char_class', },
+'CharIntersection': {'a': 'char_class', 'b': 'char_class', },
+'CharComplement': {'inverse': 'char_class'},
+'CharPrebuilt': {}, # Semantic is a pre-built character-class vector.
+})
+VOCAB.define_category('regex', {
+'Sequence': {'a':'regex', 'b':'regex'},
+'Alternation': {'a':'regex', 'b':'regex'},
+'Star': {'sub':'regex'},
+'Hook': {'sub':'regex'},
+'Plus': {'sub':'regex'},
+'n_times': {'sub':'regex', 'num':'Bound'},
+'n_or_more': {'sub':'regex', 'min':'Bound'},
+'n_or_fewer': {'sub':'regex', 'max':'Bound'},
+'n_to_m': {'sub':'regex', 'min':'Bound', 'max':'Bound'},
+'NamedSubexpression': {}, # Semantic is subexpression name.
+})
+VOCAB.define_category('pattern', {
+'pattern_regular': {'left_context':'left_context', 'stem':'regex'},
+'pattern_with_trail': {'left_context':'left_context', 'stem':'regex', 'trail':'regex'},
+'pattern_only_trail': {'left_context':'left_context', 'trail':'regex'},
+})
+VOCAB.define_category('Bound', {
+"Bound": {} # Semantic is number (or None).
+})
LEFT_CONTEXT = {
'anywhere': (True, True),
'begin_line': (False, True),
'mid_line': (True, False),
}
-for x in LEFT_CONTEXT:
-VOCAB[x] = make_symbol(x, {}, 'left_context')
+VOCAB.define_category('left_context', {x:{} for x in LEFT_CONTEXT.keys()})

class Encoder:
"""
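
The payoff for routing every symbol through define_category is that check_sanity() can now flag any field whose category was never defined. A self-contained sketch -- the 'thing' category and the deliberately undefined 'oops' are hypothetical:

from boozetools.arborist.trees import Ontology, OntologyError

v = Ontology()
v.define_category('thing', {'Pair': {'a': 'thing', 'b': 'oops'}})  # 'oops' is never defined
try:
    v.check_sanity()
except OntologyError as e:
    print(e.args[0])  # {'oops'} -- the set of "bogon" categories
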
7 changes: 4 additions & 3 deletions tests/test_regular.py
@@ -47,8 +47,8 @@ def test_00_lengths_behave_correctly(self):
Explains the nature of computing the a-priori length of a regular expression.
This gets used in working out the details for trailing-context expressions.
"""
-rcl = regular.codepoint.leaf
-rbl = regular.bound.leaf
+rcl = regular.VOCAB['Codepoint'].leaf
+rbl = regular.VOCAB['Bound'].leaf
one = regular.VOCAB['CharRange'].from_args(rcl(32), rcl(127), ) # The ascii printing characters :)
two = regular.VOCAB['Sequence'].from_args(one, one) # Two of them in a row
sizer = regular.Sizer({})
@@ -66,4 +66,5 @@ def test_00_lengths_behave_correctly(self):
(regular.VOCAB['n_to_m'].from_args(two, rbl(3), rbl(4)), None),
]: self.assertEqual(regex.tour(sizer), expected_size)


+def test_01_ontology_is_sane(self):
+regular.VOCAB.check_sanity()
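
The new test is a one-liner because the real work happens at import time: merely importing regular builds the entire VOCAB. The equivalent check outside the test harness (module path per the file headers above):

from boozetools.scanning import regular
regular.VOCAB.check_sanity()  # raises OntologyError if any field names an undefined category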
