Apply ontology to trees
kjosib committed Nov 16, 2021
1 parent ed33336 commit fd0ed82
Showing 4 changed files with 98 additions and 51 deletions.
53 changes: 46 additions & 7 deletions boozetools/arborist/trees.py
@@ -22,10 +22,10 @@
3. A consistent source-tracking channel for tracing the origin(s) of bogus input.
4. The possibility of data-driven transformations on tree structures.
-What's here isn't necessarily perfect yet, but it's doing some jobs.
+What's here is far from perfect yet, but it's doing some jobs.
"""

-__all__ = ['make_symbol', 'Node']
+__all__ = ['Ontology', 'OntologyError', 'Node']

from typing import NamedTuple, Optional
from dataclasses import dataclass
@@ -34,10 +34,10 @@
class _Symbol(NamedTuple):
""" A tree node's "symbol" corresponds to its "constructor" in an "abstract data types" conception of trees. """
label: str
-arity: tuple[str, ...]
+arity: tuple[str, ...] # Why strings? Why can't they be category-objects? Or have the list nature?
index: dict[str, int]
category: Optional[str] # Category may be thought of as a "data type" which may have several constructors/symbols.
-origin: object
+ontology: "Ontology"

def node(self, *, semantic:object, children:tuple["Node", ...], debug_info) -> "Node":
"""
@@ -58,9 +58,48 @@ def from_args(self, *children, debug_info=None):
""" Convenience function for mini-parse grammars. """
return self.node(semantic=None, children=children, debug_info=debug_info)

-def make_symbol(label:str, kids:dict[str,str], category:str=None, origin=None):
-return _Symbol(label, tuple(kids.values()), dict((k,i) for i,k in enumerate(kids.keys())), category, origin)
+class OntologyError(ValueError):
+pass
+
+class Ontology:
+"""
+The symbols in a given ontology are meant to hang together.
+This is fairly simplistic, in that the categories do not form any sort of network.
+But it will do for experimentation.
+Two obvious enhancement ideas:
+For error reporting, you might want to know where the ontology came from.
+For language embedding, you might want to import symbols from another ontology.
+"""
+def __init__(self):
+self.symbols = {}
+self.defined_categories = {}
+self.mentioned_categories = set()
+
+def __getitem__(self, item):
+return self.symbols[item]
+
+def define_category(self, category:str, cases:dict[str,dict[str,str]]):
+"""
+category: a string describing the general data type all the cases fulfill.
+cases: dict[label, dict[field, category]]
+Why not accept a term from a meta-ontology? Because -- well -- not yet.
+"""
+if category in self.defined_categories: raise OntologyError(category)
+self.defined_categories[category] = set(cases.keys())
+for label, kids in cases.items():
+if label in self.symbols: raise OntologyError(label)
+self.mentioned_categories.update(kids.values())
+self.symbols[label] = _Symbol(label, tuple(kids.values()), dict((k,i) for i,k in enumerate(kids.keys())), category, self)
+
+def check_sanity(self):
+"""
+The ontology is sane when every field's category is defined.
+This would have to change if and when imports happen.
+"""
+bogons = self.mentioned_categories - self.defined_categories.keys()
+if bogons: raise OntologyError(bogons)
+

@dataclass(eq=False)
class Node:
Expand All @@ -69,7 +108,7 @@ class Node:
This structure will be the backbone of the "arborist" framework.
"""
__slots__ = ('symbol', 'semantic', 'children', 'debug_info')
-symbol: _Symbol # Refers into a dictionary of symbol definitions.
+symbol: _Symbol  # Refers into a dictionary of symbol definitions.
semantic: object # Mutable in general, but a bottom-up pass may provide a basis object.
children: tuple # Must have correct arity for the symbol.
debug_info: object # Although this remains application-defined, often a file position might work.
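
Before moving on to the call sites: a minimal sketch of the new API end to end, using only what the hunks above show. The 'expr' category and its labels are invented for illustration, and it is assumed (the body of _Symbol.node() lies outside this excerpt) that node() accepts any children tuple matching the symbol's arity.

from boozetools.arborist.trees import Ontology

VOCAB = Ontology()                          # a hypothetical vocabulary, not one from this commit
VOCAB.define_category('expr', {
    'Literal': {},                          # no fields; the semantic slot would carry the value
    'Add': {'lhs': 'expr', 'rhs': 'expr'},  # two children, each of category 'expr'
})
VOCAB.check_sanity()                        # passes: the only category mentioned is also defined

lit = VOCAB['Literal'].from_args()          # arity () -- a childless Node
tree = VOCAB['Add'].from_args(lit, lit)     # arity ('expr', 'expr')

Defining 'expr' twice, or reusing a label in another category, raises OntologyError: both of the Ontology's dictionaries double as uniqueness checks.
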
29 changes: 15 additions & 14 deletions boozetools/scanning/miniscan.py
@@ -5,7 +5,8 @@
from ..parsing import miniparse
from . import finite, regular, charset, recognition

-PRELOAD = {'ASCII': {k: regular.char_prebuilt.leaf(cls) for k, cls in charset.mode_ascii.items()}}
+char_prebuilt = regular.VOCAB['CharPrebuilt']
+PRELOAD = {'ASCII': {k: char_prebuilt.leaf(cls) for k, cls in charset.mode_ascii.items()}}


class Definition:
@@ -180,7 +181,7 @@ def _BOOTSTRAP_REGEX_SCANNER_():
def seq(head, *tail):
for t in tail: head = regular.VOCAB['Sequence'].from_args(head, t)
return head
-def txt(s):return seq(*(regular.codepoint.leaf(ord(_)) for _ in s))
+def txt(s):return seq(*(regular.VOCAB['Codepoint'].leaf(ord(_)) for _ in s))

def _metatoken(yy): yy.token(yy.matched_text(), None)
def _and_then(condition):
@@ -195,24 +196,24 @@ def fn(yy):
return fn
def _bracket_reference(yy:interfaces.Scanner):
name = yy.matched_text()[1:-1]
-node = regular.named_subexpression.leaf(name, yy.current_span())
+node = regular.VOCAB['NamedSubexpression'].leaf(name, yy.current_span())
yy.token('reference', node)
def _shorthand_reference(yy:interfaces.Scanner):
-yy.token('reference', regular.named_subexpression.leaf(yy.matched_text()[1], yy.current_span()))
+yy.token('reference', regular.VOCAB['NamedSubexpression'].leaf(yy.matched_text()[1], yy.current_span()))
def _dot_reference(yy:interfaces.Scanner):
-yy.token('reference', regular.named_subexpression.leaf('DOT', yy.current_span()))
-def _hex_escape(yy): yy.token('codepoint', regular.codepoint.leaf(int(yy.matched_text()[2:], 16)))
-def _control(yy): yy.token('codepoint', regular.codepoint.leaf(31 & ord(yy.matched_text()[2:])))
-def _arbitrary_character(yy): yy.token('codepoint', regular.codepoint.leaf(ord(yy.matched_text())))
+yy.token('reference', regular.VOCAB['NamedSubexpression'].leaf('DOT', yy.current_span()))
+def _hex_escape(yy): yy.token('codepoint', regular.VOCAB['Codepoint'].leaf(int(yy.matched_text()[2:], 16)))
+def _control(yy): yy.token('codepoint', regular.VOCAB['Codepoint'].leaf(31 & ord(yy.matched_text()[2:])))
+def _arbitrary_character(yy): yy.token('codepoint', regular.VOCAB['Codepoint'].leaf(ord(yy.matched_text())))
def _class_initial_close_bracket(yy):
yy.enter('in_class')
_arbitrary_character(yy)
def _class_final_dash(yy):
yy.token('codepoint', ord('-'))
yy.token(']', None)
yy.enter(None)
-def _arbitrary_escape(yy): yy.token('codepoint', regular.codepoint.leaf(ord(yy.matched_text()[1:])))
-def _number(yy): yy.token('number', regular.bound.leaf(int(yy.matched_text())))
+def _arbitrary_escape(yy): yy.token('codepoint', regular.VOCAB['Codepoint'].leaf(ord(yy.matched_text()[1:])))
+def _number(yy): yy.token('number', regular.VOCAB['Bound'].leaf(int(yy.matched_text())))
def _dollar(charclass):
def fn(yy:Scanner): yy.token('end', charclass)
return fn
@@ -228,8 +229,8 @@ def ref(x): return PRELOAD['ASCII'][x]

dot = ref('DOT')

-eof_charclass = regular.char_prebuilt.leaf(charset.EOF)
-dollar_charclass = regular.char_prebuilt.leaf(charset.union(charset.EOF, PRELOAD['ASCII']['vertical'].semantic))
+eof_charclass = regular.VOCAB['CharPrebuilt'].leaf(charset.EOF)
+dollar_charclass = regular.VOCAB['CharPrebuilt'].leaf(charset.union(charset.EOF, PRELOAD['ASCII']['vertical'].semantic))

meta = Definition()

@@ -257,9 +258,9 @@ def ref(x): return PRELOAD['ASCII'][x]
anywhere.install_rule(expression=seq(txt('{'), ref('alpha'), regular.VOCAB['Plus'].from_args(ref('word')), txt('}'), ), action=_bracket_reference)
whack = txt('\\') # NB: Python doesn't let you end a raw-string with a backslash.
for c, n in [('x', 2), ('u', 4), ('U', 8)]:
-hexblock = regular.VOCAB['n_times'].from_args(ref('xdigit'), regular.bound.leaf(n))
+hexblock = regular.VOCAB['n_times'].from_args(ref('xdigit'), regular.VOCAB['Bound'].leaf(n))
anywhere.install_rule(expression=seq(whack, txt(c), hexblock), action=_hex_escape)
-anywhere.install_rule(expression=seq(whack, txt('c'), regular.char_prebuilt.leaf(charset.range_class(64, 127))), action=_control)
+anywhere.install_rule(expression=seq(whack, txt('c'), regular.VOCAB['CharPrebuilt'].leaf(charset.range_class(64, 127))), action=_control)
anywhere.install_rule(expression=seq(whack, ref('alnum')), action=_shorthand_reference)
anywhere.install_rule(expression=seq(whack, dot), action=_arbitrary_escape)
anywhere.install_rule(expression=dot, action=_arbitrary_character)
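
Every change in this file is the same move: a free-standing symbol constant becomes a label-keyed lookup in the regular module's shared VOCAB. Schematically (leaf() is assumed, from its uses above, to wrap a semantic value in a childless Node; its definition lies outside this excerpt):

# Before this commit: each symbol was a module-level constant.
node = regular.codepoint.leaf(ord('A'))

# After: one Ontology owns every symbol, so call sites name them by label.
node = regular.VOCAB['Codepoint'].leaf(ord('A'))
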
60 changes: 33 additions & 27 deletions boozetools/scanning/regular.py
@@ -1,6 +1,6 @@
""" An AST class-hierarchy for regular expressions and mechanisms for translating them. """

-from ..arborist.trees import make_symbol, Node
+from ..arborist.trees import Ontology, Node
from . import finite, charset

class PatternError(Exception):
@@ -38,39 +38,45 @@ class PatternSyntaxError(PatternError):
"""
errmsg = "Pattern syntax does not compute."

-codepoint = make_symbol('Codepoint', {}, 'char_class') # Semantic is codepoint.

+VOCAB = Ontology()
# A bit of theory: A character class is an intersection of one or more (possibly-inverted) set/unions;
# each set consists of one or more of codepoints, ranges, and named-classes. Therefore, we get this alphabet:
-VOCAB = {s:make_symbol(s,k,c) for (s,k,c) in [
-('CharRange', {'first':'Codepoint', 'last':'Codepoint'}, 'char_class'),
-('Sequence', {'a':'regular', 'b':'regular'}, 'regex'),
-('Alternation', {'a':'regular', 'b':'regular'}, 'regex'),
-('Star', {'sub':'regular'}, 'regex'),
-('Hook', {'sub':'regular'}, 'regex'),
-('Plus', {'sub':'regular'}, 'regex'),
-('n_times', {'sub':'regular', 'num':'Bound'}, 'regex'),
-('n_or_more', {'sub':'regular', 'min':'Bound'}, 'regex'),
-('n_or_fewer', {'sub':'regular', 'max':'Bound'}, 'regex'),
-('n_to_m', {'sub':'regular', 'min':'Bound', 'max':'Bound'}, 'regex'),
-('CharUnion', {'a': 'char_class', 'b': 'char_class', }, 'char_class'),
-('CharIntersection', {'a': 'char_class', 'b': 'char_class', }, 'char_class'),
-('CharComplement', {'inverse': 'char_class'}, 'char_class'),
-('pattern_regular', {'left_context':'left_context', 'stem':'regular'}, 'pattern'),
-('pattern_with_trail', {'left_context':'left_context', 'stem':'regular', 'trail':'regular'}, 'pattern'),
-('pattern_only_trail', {'left_context':'left_context', 'trail':'regular'}, 'pattern'),
-]}
-char_prebuilt = make_symbol('CharPrebuilt', {}, 'char_class')
-bound = make_symbol('Bound', {}) # Semantic is number (or None).
-named_subexpression = make_symbol('NamedSubexpression', {}, 'regex') # Semantic is subexpression name.

+VOCAB.define_category('Codepoint', {
+'Codepoint': {}, # Semantic is codepoint.
+})
+VOCAB.define_category('char_class', {
+'CharRange': {'first':'Codepoint', 'last':'Codepoint'},
+'CharUnion': {'a': 'char_class', 'b': 'char_class', },
+'CharIntersection': {'a': 'char_class', 'b': 'char_class', },
+'CharComplement': {'inverse': 'char_class'},
+'CharPrebuilt': {}, # Semantic is a pre-built character-class vector.
+})
+VOCAB.define_category('regex', {
+'Sequence': {'a':'regex', 'b':'regex'},
+'Alternation': {'a':'regex', 'b':'regex'},
+'Star': {'sub':'regex'},
+'Hook': {'sub':'regex'},
+'Plus': {'sub':'regex'},
+'n_times': {'sub':'regex', 'num':'Bound'},
+'n_or_more': {'sub':'regex', 'min':'Bound'},
+'n_or_fewer': {'sub':'regex', 'max':'Bound'},
+'n_to_m': {'sub':'regex', 'min':'Bound', 'max':'Bound'},
+'NamedSubexpression': {}, # Semantic is subexpression name.
+})
+VOCAB.define_category('pattern', {
+'pattern_regular': {'left_context':'left_context', 'stem':'regex'},
+'pattern_with_trail': {'left_context':'left_context', 'stem':'regex', 'trail':'regex'},
+'pattern_only_trail': {'left_context':'left_context', 'trail':'regex'},
+})
+VOCAB.define_category('Bound', {
+"Bound": {} # Semantic is number (or None).
+})
LEFT_CONTEXT = {
'anywhere': (True, True),
'begin_line': (False, True),
'mid_line': (True, False),
}
-for x in LEFT_CONTEXT:
-VOCAB[x] = make_symbol(x, {}, 'left_context')
+VOCAB.define_category('left_context', {x:{} for x in LEFT_CONTEXT.keys()})

class Encoder:
"""
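
The payoff for routing every symbol through define_category is that check_sanity() can now flag any field whose category was never defined. A self-contained sketch -- the 'thing' category and the deliberately undefined 'oops' are hypothetical:

from boozetools.arborist.trees import Ontology, OntologyError

v = Ontology()
v.define_category('thing', {'Pair': {'a': 'thing', 'b': 'oops'}})  # 'oops' is never defined
try:
    v.check_sanity()
except OntologyError as e:
    print(e.args[0])  # {'oops'} -- the set of "bogon" categories
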
7 changes: 4 additions & 3 deletions tests/test_regular.py
@@ -47,8 +47,8 @@ def test_00_lengths_behave_correctly(self):
Explains the nature of computing the a-priori length of a regular expression.
This gets used in working out the details for trailing-context expressions.
"""
-rcl = regular.codepoint.leaf
-rbl = regular.bound.leaf
+rcl = regular.VOCAB['Codepoint'].leaf
+rbl = regular.VOCAB['Bound'].leaf
one = regular.VOCAB['CharRange'].from_args(rcl(32), rcl(127), ) # The ascii printing characters :)
two = regular.VOCAB['Sequence'].from_args(one, one) # Two of them in a row
sizer = regular.Sizer({})
@@ -66,4 +66,5 @@ def test_00_lengths_behave_correctly(self):
(regular.VOCAB['n_to_m'].from_args(two, rbl(3), rbl(4)), None),
]: self.assertEqual(regex.tour(sizer), expected_size)


+def test_01_ontology_is_sane(self):
+regular.VOCAB.check_sanity()
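
The new test is a one-liner because the real work happens at import time: merely importing regular builds the entire VOCAB. The equivalent check outside the test harness (module path per the file headers above):

from boozetools.scanning import regular
regular.VOCAB.check_sanity()  # raises OntologyError if any field names an undefined category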
