diff --git a/scripts/pegen/__init__.py b/scripts/pegen/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/pegen/__main__.py b/scripts/pegen/__main__.py new file mode 100644 index 000000000..69337ba59 --- /dev/null +++ b/scripts/pegen/__main__.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3.8 + +"""pegen -- PEG Generator. + +Search the web for PEG Parsers for reference. +""" + +import argparse +import sys +import time +import token +import traceback +from typing import Tuple + +from pegen.build import Grammar, Parser, ParserGenerator, Tokenizer +from pegen.validator import validate_grammar + + +def generate_c_code( + args: argparse.Namespace, +) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]: + from pegen.build import build_c_parser_and_generator + + verbose = args.verbose + verbose_tokenizer = verbose >= 3 + verbose_parser = verbose == 2 or verbose >= 4 + try: + grammar, parser, tokenizer, gen = build_c_parser_and_generator( + args.grammar_filename, + args.tokens_filename, + args.output, + args.compile_extension, + verbose_tokenizer, + verbose_parser, + args.verbose, + keep_asserts_in_extension=False if args.optimized else True, + skip_actions=args.skip_actions, + ) + return grammar, parser, tokenizer, gen + except Exception as err: + if args.verbose: + raise # Show traceback + traceback.print_exception(err.__class__, err, None) + sys.stderr.write("For full traceback, use -v\n") + sys.exit(1) + + +def generate_python_code( + args: argparse.Namespace, +) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]: + from pegen.build import build_python_parser_and_generator + + verbose = args.verbose + verbose_tokenizer = verbose >= 3 + verbose_parser = verbose == 2 or verbose >= 4 + try: + grammar, parser, tokenizer, gen = build_python_parser_and_generator( + args.grammar_filename, + args.output, + verbose_tokenizer, + verbose_parser, + skip_actions=args.skip_actions, + ) + return grammar, parser, tokenizer, gen + except Exception as err: + if args.verbose: + raise # Show traceback + traceback.print_exception(err.__class__, err, None) + sys.stderr.write("For full traceback, use -v\n") + sys.exit(1) + +def generate_javascript_code( + args: argparse.Namespace, +) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]: + from pegen.build import build_javascript_parser_and_generator + verbose = args.verbose + verbose_tokenizer = verbose >= 3 + verbose_parser = verbose == 2 or verbose >= 4 + try: + grammar, parser, tokenizer, gen = build_javascript_parser_and_generator( + args.grammar_filename, + args.tokens_filename, + args.output, + args.compile_extension, + verbose_tokenizer, + verbose_parser, + args.verbose, + keep_asserts_in_extension=False if args.optimized else True, + skip_actions=args.skip_actions, + ) + return grammar, parser, tokenizer, gen + except 
Exception as err: + if args.verbose: + raise # Show traceback + traceback.print_exception(err.__class__, err, None) + sys.stderr.write("For full traceback, use -v\n") + sys.exit(1) + +argparser = argparse.ArgumentParser( + prog="pegen", description="Experimental PEG-like parser generator" +) +argparser.add_argument("-q", "--quiet", action="store_true", help="Don't print the parsed grammar") +argparser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="Print timing stats; repeat for more debug output", +) +subparsers = argparser.add_subparsers(help="target language for the generated code") + +c_parser = subparsers.add_parser("c", help="Generate C code for inclusion into CPython") +c_parser.set_defaults(func=generate_c_code) +c_parser.add_argument("grammar_filename", help="Grammar description") +c_parser.add_argument("tokens_filename", help="Tokens description") +c_parser.add_argument( + "-o", "--output", metavar="OUT", default="parse.c", help="Where to write the generated parser" +) +c_parser.add_argument( + "--compile-extension", + action="store_true", + help="Compile generated C code into an extension module", +) +c_parser.add_argument( + "--optimized", action="store_true", help="Compile the extension in optimized mode" +) +c_parser.add_argument( + "--skip-actions", + action="store_true", + help="Suppress code emission for rule actions", +) + +python_parser = subparsers.add_parser("python", help="Generate Python code") +python_parser.set_defaults(func=generate_python_code) +python_parser.add_argument("grammar_filename", help="Grammar description") +python_parser.add_argument( + "-o", + "--output", + metavar="OUT", + default="parse.py", + help="Where to write the generated parser", +) +python_parser.add_argument( + "--skip-actions", + action="store_true", + help="Suppress code emission for rule actions", +) + +javascript_parser = subparsers.add_parser("javascript", help="Generate Javascript code for inclusion into CPython") +javascript_parser.set_defaults(func=generate_javascript_code) +javascript_parser.add_argument("grammar_filename", help="Grammar description") +javascript_parser.add_argument("tokens_filename", help="Tokens description") +javascript_parser.add_argument( + "-o", "--output", metavar="OUT", default="parse.js", help="Where to write the generated parser" +) +javascript_parser.add_argument( + "--compile-extension", + action="store_true", + help="Compile generated C code into an extension module", +) +javascript_parser.add_argument( + "--optimized", action="store_true", help="Compile the extension in optimized mode" +) +javascript_parser.add_argument( + "--skip-actions", + action="store_true", + help="Suppress code emission for rule actions", +) + + +def main() -> None: + from pegen.testutil import print_memstats + + args = argparser.parse_args() + if "func" not in args: + argparser.error("Must specify the target language mode ('c' or 'python')") + + t0 = time.time() + grammar, parser, tokenizer, gen = args.func(args) + t1 = time.time() + + validate_grammar(grammar) + + if not args.quiet: + if args.verbose: + print("Raw Grammar:") + for line in repr(grammar).splitlines(): + print(" ", line) + + print("Clean Grammar:") + for line in str(grammar).splitlines(): + print(" ", line) + + if args.verbose: + print("First Graph:") + for src, dsts in gen.first_graph.items(): + print(f" {src} -> {', '.join(dsts)}") + print("First SCCS:") + for scc in gen.first_sccs: + print(" ", scc, end="") + if len(scc) > 1: + print( + " # Indirectly left-recursive; leaders:", + 
{name for name in scc if grammar.rules[name].leader}, + ) + else: + name = next(iter(scc)) + if name in gen.first_graph[name]: + print(" # Left-recursive") + else: + print() + + if args.verbose: + dt = t1 - t0 + diag = tokenizer.diagnose() + nlines = diag.end[0] + if diag.type == token.ENDMARKER: + nlines -= 1 + print(f"Total time: {dt:.3f} sec; {nlines} lines", end="") + if dt: + print(f"; {nlines / dt:.0f} lines/sec") + else: + print() + print("Caches sizes:") + print(f" token array : {len(tokenizer._tokens):10}") + print(f" cache : {len(parser._cache):10}") + if not print_memstats(): + print("(Can't find psutil; install it for memory stats.)") + + +if __name__ == "__main__": + if sys.version_info < (3, 8): + print("ERROR: using pegen requires at least Python 3.8!", file=sys.stderr) + sys.exit(1) + main() diff --git a/scripts/pegen/ast_dump.py b/scripts/pegen/ast_dump.py new file mode 100644 index 000000000..2c57d0932 --- /dev/null +++ b/scripts/pegen/ast_dump.py @@ -0,0 +1,71 @@ +""" +Copy-parse of ast.dump, removing the `isinstance` checks. This is needed, +because testing pegen requires generating a C extension module, which contains +a copy of the symbols defined in Python-ast.c. Thus, the isinstance check would +always fail. We rely on string comparison of the base classes instead. +TODO: Remove the above-described hack. +""" + +from typing import Any, Optional, Tuple + + +def ast_dump( + node: Any, + annotate_fields: bool = True, + include_attributes: bool = False, + *, + indent: Optional[str] = None, +) -> str: + def _format(node: Any, level: int = 0) -> Tuple[str, bool]: + if indent is not None: + level += 1 + prefix = "\n" + indent * level + sep = ",\n" + indent * level + else: + prefix = "" + sep = ", " + if any(cls.__name__ == "AST" for cls in node.__class__.__mro__): + cls = type(node) + args = [] + allsimple = True + keywords = annotate_fields + for name in node._fields: + try: + value = getattr(node, name) + except AttributeError: + keywords = True + continue + if value is None and getattr(cls, name, ...) is None: + keywords = True + continue + value, simple = _format(value, level) + allsimple = allsimple and simple + if keywords: + args.append("%s=%s" % (name, value)) + else: + args.append(value) + if include_attributes and node._attributes: + for name in node._attributes: + try: + value = getattr(node, name) + except AttributeError: + continue + if value is None and getattr(cls, name, ...) 
is None: + continue + value, simple = _format(value, level) + allsimple = allsimple and simple + args.append("%s=%s" % (name, value)) + if allsimple and len(args) <= 3: + return "%s(%s)" % (node.__class__.__name__, ", ".join(args)), not args + return "%s(%s%s)" % (node.__class__.__name__, prefix, sep.join(args)), False + elif isinstance(node, list): + if not node: + return "[]", True + return "[%s%s]" % (prefix, sep.join(_format(x, level)[0] for x in node)), False + return repr(node), True + + if all(cls.__name__ != "AST" for cls in node.__class__.__mro__): + raise TypeError("expected AST, got %r" % node.__class__.__name__) + if indent is not None and not isinstance(indent, str): + indent = " " * indent + return _format(node)[0] diff --git a/scripts/pegen/build.py b/scripts/pegen/build.py new file mode 100644 index 000000000..f45506a28 --- /dev/null +++ b/scripts/pegen/build.py @@ -0,0 +1,472 @@ +import itertools +import os +import pathlib +import sys +import sysconfig +import tempfile +import tokenize +from typing import IO, Dict, List, Optional, Set, Tuple + +from pegen.c_generator import CParserGenerator +from pegen.grammar import Grammar +from pegen.grammar_parser import GeneratedParser as GrammarParser +from pegen.parser import Parser +from pegen.parser_generator import ParserGenerator +from pegen.python_generator import PythonParserGenerator +from pegen.javascript_generator import JavascriptParserGenerator +from pegen.tokenizer import Tokenizer + +MOD_DIR = pathlib.Path(__file__).resolve().parent + +TokenDefinitions = Tuple[Dict[int, str], Dict[str, int], Set[str]] + + +def get_extra_flags(compiler_flags: str, compiler_py_flags_nodist: str) -> List[str]: + flags = sysconfig.get_config_var(compiler_flags) + py_flags_nodist = sysconfig.get_config_var(compiler_py_flags_nodist) + if flags is None or py_flags_nodist is None: + return [] + return f"{flags} {py_flags_nodist}".split() + + +def fixup_build_ext(cmd): + """Function needed to make build_ext tests pass. + + When Python was built with --enable-shared on Unix, -L. is not enough to + find libpython.so, because regrtest runs in a tempdir, not in the + source directory where the .so lives. + + When Python was built with in debug mode on Windows, build_ext commands + need their debug attribute set, and it is not done automatically for + some reason. + + This function handles both of these things. Example use: + + cmd = build_ext(dist) + support.fixup_build_ext(cmd) + cmd.ensure_finalized() + + Unlike most other Unix platforms, Mac OS X embeds absolute paths + to shared libraries into executables, so the fixup is not needed there. + + Taken from distutils (was part of the CPython stdlib until Python 3.11) + """ + if os.name == 'nt': + cmd.debug = sys.executable.endswith('_d.exe') + elif sysconfig.get_config_var('Py_ENABLE_SHARED'): + # To further add to the shared builds fun on Unix, we can't just add + # library_dirs to the Extension() instance because that doesn't get + # plumbed through to the final compiler command. 
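+ # RUNSHARED is usually of the form "LD_LIBRARY_PATH=/path/to/build" (one or more os.pathsep-separated directories after the '='); only that directory list is needed for the library search path here.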
+ runshared = sysconfig.get_config_var('RUNSHARED') + if runshared is None: + cmd.library_dirs = ['.'] + else: + if sys.platform == 'darwin': + cmd.library_dirs = [] + else: + name, equals, value = runshared.partition('=') + cmd.library_dirs = [d for d in value.split(os.pathsep) if d] + + + +def compile_c_extension( + generated_source_path: str, + build_dir: Optional[str] = None, + verbose: bool = False, + keep_asserts: bool = True, + disable_optimization: bool = False, + library_dir: Optional[str] = None, +) -> str: + """Compile the generated source for a parser generator into an extension module. + + The extension module will be generated in the same directory as the provided path + for the generated source, with the same basename (in addition to extension module + metadata). For example, for the source mydir/parser.c the generated extension + in a darwin system with python 3.8 will be mydir/parser.cpython-38-darwin.so. + + If *build_dir* is provided, that path will be used as the temporary build directory + of distutils (this is useful in case you want to use a temporary directory). + + If *library_dir* is provided, that path will be used as the directory for a + static library of the common parser sources (this is useful in case you are + creating multiple extensions). + """ + import setuptools.logging + + from setuptools import Extension, Distribution + from setuptools._distutils.dep_util import newer_group + from setuptools._distutils.ccompiler import new_compiler + from setuptools._distutils.sysconfig import customize_compiler + + if verbose: + setuptools.logging.set_threshold(setuptools.logging.logging.DEBUG) + + source_file_path = pathlib.Path(generated_source_path) + extension_name = source_file_path.stem + extra_compile_args = get_extra_flags("CFLAGS", "PY_CFLAGS_NODIST") + extra_compile_args.append("-DPy_BUILD_CORE_MODULE") + # Define _Py_TEST_PEGEN to not call PyAST_Validate() in Parser/pegen.c + extra_compile_args.append("-D_Py_TEST_PEGEN") + extra_link_args = get_extra_flags("LDFLAGS", "PY_LDFLAGS_NODIST") + if keep_asserts: + extra_compile_args.append("-UNDEBUG") + if disable_optimization: + if sys.platform == 'win32': + extra_compile_args.append("/Od") + extra_link_args.append("/LTCG:OFF") + else: + extra_compile_args.append("-O0") + if sysconfig.get_config_var("GNULD") == "yes": + extra_link_args.append("-fno-lto") + + common_sources = [ + str(MOD_DIR.parent.parent.parent / "Python" / "Python-ast.c"), + str(MOD_DIR.parent.parent.parent / "Python" / "asdl.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "pegen.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "pegen_errors.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "action_helpers.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "string_parser.c"), + str(MOD_DIR.parent / "peg_extension" / "peg_extension.c"), + ] + include_dirs = [ + str(MOD_DIR.parent.parent.parent / "Include" / "internal"), + str(MOD_DIR.parent.parent.parent / "Parser"), + ] + extension = Extension( + extension_name, + sources=[generated_source_path], + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ) + dist = Distribution({"name": extension_name, "ext_modules": [extension]}) + cmd = dist.get_command_obj("build_ext") + fixup_build_ext(cmd) + cmd.build_lib = str(source_file_path.parent) + cmd.include_dirs = include_dirs + if build_dir: + cmd.build_temp = build_dir + cmd.ensure_finalized() + + compiler = new_compiler() + 
customize_compiler(compiler) + compiler.set_include_dirs(cmd.include_dirs) + compiler.set_library_dirs(cmd.library_dirs) + # build static lib + if library_dir: + library_filename = compiler.library_filename(extension_name, + output_dir=library_dir) + if newer_group(common_sources, library_filename, 'newer'): + if sys.platform == 'win32': + pdb = compiler.static_lib_format % (extension_name, '.pdb') + compile_opts = [f"/Fd{library_dir}\\{pdb}"] + compile_opts.extend(extra_compile_args) + else: + compile_opts = extra_compile_args + objects = compiler.compile(common_sources, + output_dir=library_dir, + debug=cmd.debug, + extra_postargs=compile_opts) + compiler.create_static_lib(objects, extension_name, + output_dir=library_dir, + debug=cmd.debug) + if sys.platform == 'win32': + compiler.add_library_dir(library_dir) + extension.libraries = [extension_name] + elif sys.platform == 'darwin': + compiler.set_link_objects([ + '-Wl,-force_load', library_filename, + ]) + else: + compiler.set_link_objects([ + '-Wl,--whole-archive', library_filename, '-Wl,--no-whole-archive', + ]) + else: + extension.sources[0:0] = common_sources + + # Compile the source code to object files. + ext_path = cmd.get_ext_fullpath(extension_name) + if newer_group(extension.sources, ext_path, 'newer'): + objects = compiler.compile(extension.sources, + output_dir=cmd.build_temp, + debug=cmd.debug, + extra_postargs=extra_compile_args) + else: + objects = compiler.object_filenames(extension.sources, + output_dir=cmd.build_temp) + # Now link the object files together into a "shared object" + compiler.link_shared_object( + objects, ext_path, + libraries=cmd.get_libraries(extension), + extra_postargs=extra_link_args, + export_symbols=cmd.get_export_symbols(extension), + debug=cmd.debug, + build_temp=cmd.build_temp) + + return pathlib.Path(ext_path) + + +def build_parser( + grammar_file: str, verbose_tokenizer: bool = False, verbose_parser: bool = False +) -> Tuple[Grammar, Parser, Tokenizer]: + with open(grammar_file) as file: + tokenizer = Tokenizer(tokenize.generate_tokens(file.readline), verbose=verbose_tokenizer) + parser = GrammarParser(tokenizer, verbose=verbose_parser) + grammar = parser.start() + + if not grammar: + raise parser.make_syntax_error(grammar_file) + + return grammar, parser, tokenizer + + +def generate_token_definitions(tokens: IO[str]) -> TokenDefinitions: + all_tokens = {} + exact_tokens = {} + non_exact_tokens = set() + numbers = itertools.count(0) + + for line in tokens: + line = line.strip() + + if not line or line.startswith("#"): + continue + + pieces = line.split() + index = next(numbers) + + if len(pieces) == 1: + (token,) = pieces + non_exact_tokens.add(token) + all_tokens[index] = token + elif len(pieces) == 2: + token, op = pieces + exact_tokens[op.strip("'")] = index + all_tokens[index] = token + else: + raise ValueError(f"Unexpected line found in Tokens file: {line}") + + return all_tokens, exact_tokens, non_exact_tokens + + +def build_c_generator( + grammar: Grammar, + grammar_file: str, + tokens_file: str, + output_file: str, + compile_extension: bool = False, + verbose_c_extension: bool = False, + keep_asserts_in_extension: bool = True, + skip_actions: bool = False, +) -> ParserGenerator: + with open(tokens_file, "r") as tok_file: + all_tokens, exact_tok, non_exact_tok = generate_token_definitions(tok_file) + with open(output_file, "w") as file: + gen: ParserGenerator = CParserGenerator( + grammar, all_tokens, exact_tok, non_exact_tok, file, skip_actions=skip_actions + ) + 
gen.generate(grammar_file) + + if compile_extension: + with tempfile.TemporaryDirectory() as build_dir: + compile_c_extension( + output_file, + build_dir=build_dir, + verbose=verbose_c_extension, + keep_asserts=keep_asserts_in_extension, + ) + return gen + + +def build_python_generator( + grammar: Grammar, + grammar_file: str, + output_file: str, + skip_actions: bool = False, +) -> ParserGenerator: + with open(output_file, "w") as file: + gen: ParserGenerator = PythonParserGenerator(grammar, file) # TODO: skip_actions + gen.generate(grammar_file) + return gen + +def build_javascript_generator( + grammar: Grammar, + grammar_file: str, + tokens_file: str, + output_file: str, + compile_extension: bool = False, + verbose_c_extension: bool = False, + keep_asserts_in_extension: bool = True, + skip_actions: bool = False, +) -> ParserGenerator: + with open(tokens_file, "r") as tok_file: + all_tokens, exact_tok, non_exact_tok = generate_token_definitions(tok_file) + with open(output_file, "w") as file: + gen: ParserGenerator = JavascriptParserGenerator( + grammar, all_tokens, exact_tok, non_exact_tok, file, + skip_actions=skip_actions, + debug=True + ) + gen.generate(grammar_file) + + if compile_extension: + with tempfile.TemporaryDirectory() as build_dir: + compile_c_extension( + output_file, + build_dir=build_dir, + verbose=verbose_c_extension, + keep_asserts=keep_asserts_in_extension, + ) + return gen + +def build_c_parser_and_generator( + grammar_file: str, + tokens_file: str, + output_file: str, + compile_extension: bool = False, + verbose_tokenizer: bool = False, + verbose_parser: bool = False, + verbose_c_extension: bool = False, + keep_asserts_in_extension: bool = True, + skip_actions: bool = False, +) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]: + """Generate rules, C parser, tokenizer, parser generator for a given grammar + + Args: + grammar_file (string): Path for the grammar file + tokens_file (string): Path for the tokens file + output_file (string): Path for the output file + compile_extension (bool, optional): Whether to compile the C extension. + Defaults to False. + verbose_tokenizer (bool, optional): Whether to display additional output + when generating the tokenizer. Defaults to False. + verbose_parser (bool, optional): Whether to display additional output + when generating the parser. Defaults to False. + verbose_c_extension (bool, optional): Whether to display additional + output when compiling the C extension. Defaults to False. + keep_asserts_in_extension (bool, optional): Whether to keep the assert statements + when compiling the extension module. Defaults to True. + skip_actions (bool, optional): Whether to pretend no rule has any actions. 
+ """ + grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser) + gen = build_c_generator( + grammar, + grammar_file, + tokens_file, + output_file, + compile_extension, + verbose_c_extension, + keep_asserts_in_extension, + skip_actions=skip_actions, + ) + + return grammar, parser, tokenizer, gen + + +def build_python_parser_and_generator( + grammar_file: str, + output_file: str, + verbose_tokenizer: bool = False, + verbose_parser: bool = False, + skip_actions: bool = False, +) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]: + """Generate rules, python parser, tokenizer, parser generator for a given grammar + + Args: + grammar_file (string): Path for the grammar file + output_file (string): Path for the output file + verbose_tokenizer (bool, optional): Whether to display additional output + when generating the tokenizer. Defaults to False. + verbose_parser (bool, optional): Whether to display additional output + when generating the parser. Defaults to False. + skip_actions (bool, optional): Whether to pretend no rule has any actions. + """ + grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser) + gen = build_python_generator( + grammar, + grammar_file, + output_file, + skip_actions=skip_actions, + ) + return grammar, parser, tokenizer, gen + +def build_javascript_parser_and_generator( + grammar_file: str, + tokens_file: str, + output_file: str, + compile_extension: bool = False, + verbose_tokenizer: bool = False, + verbose_parser: bool = False, + verbose_c_extension: bool = False, + keep_asserts_in_extension: bool = True, + skip_actions: bool = False, +) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]: + """Generate rules, JavaScript parser, tokenizer, parser generator for a given grammar + + Args: + grammar_file (string): Path for the grammar file + tokens_file (string): Path for the tokens file + output_file (string): Path for the output file + compile_extension (bool, optional): Whether to compile the C extension. + Defaults to False. + verbose_tokenizer (bool, optional): Whether to display additional output + when generating the tokenizer. Defaults to False. + verbose_parser (bool, optional): Whether to display additional output + when generating the parser. Defaults to False. + verbose_c_extension (bool, optional): Whether to display additional + output when compiling the C extension. Defaults to False. 
+ keep_asserts_in_extension (bool, optional): Whether to keep the assert statements + when compiling the extension module. Defaults to True. + skip_actions (bool, optional): Whether to pretend no rule has any actions. + """ + grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser) + gen = build_javascript_generator( + grammar, + grammar_file, + tokens_file, + output_file, + compile_extension, + verbose_c_extension, + keep_asserts_in_extension, + skip_actions=skip_actions, + ) + + return grammar, parser, tokenizer, gen \ No newline at end of file diff --git a/scripts/pegen/c_generator.py b/scripts/pegen/c_generator.py new file mode 100644 index 000000000..301949bda --- /dev/null +++ b/scripts/pegen/c_generator.py @@ -0,0 +1,871 @@ +import ast +import os.path +import re +from dataclasses import dataclass, field +from enum import Enum +from typing import IO, Any, Dict, List, Optional, Set, Text, Tuple + +from pegen import grammar +from pegen.grammar import ( + Alt, + Cut, + Forced, + Gather, + GrammarVisitor, + Group, + Leaf, + Lookahead, + NamedItem, + NameLeaf, + NegativeLookahead, + Opt, + PositiveLookahead, + Repeat0, + Repeat1, + Rhs, + Rule, + StringLeaf, +) +from pegen.parser_generator import ParserGenerator + +EXTENSION_PREFIX = """\ +#include "pegen.h" + +#if defined(Py_DEBUG) && defined(Py_BUILD_CORE) +# define D(x) if (p->debug) { x; } +#else +# define D(x) +#endif + +#ifdef __wasi__ +# define MAXSTACK 4000 +#else +# define MAXSTACK 6000 +#endif + +""" + + +EXTENSION_SUFFIX = """ +void * +_PyPegen_parse(Parser *p) +{ + // Initialize keywords + p->keywords = reserved_keywords; + p->n_keyword_lists = n_keyword_lists; + p->soft_keywords = soft_keywords; + + return start_rule(p); +} +""" + + +class NodeTypes(Enum): + NAME_TOKEN = 0 + NUMBER_TOKEN = 1 + STRING_TOKEN = 2 + GENERIC_TOKEN = 3 + KEYWORD = 4 + SOFT_KEYWORD = 5 + CUT_OPERATOR = 6 + F_STRING_CHUNK = 7 + + +BASE_NODETYPES = { + "NAME": NodeTypes.NAME_TOKEN, + "NUMBER": NodeTypes.NUMBER_TOKEN, + "STRING": NodeTypes.STRING_TOKEN, + "SOFT_KEYWORD": NodeTypes.SOFT_KEYWORD, +} + + +@dataclass +class FunctionCall: + function: str + arguments: List[Any] = field(default_factory=list) + assigned_variable: Optional[str] = None + assigned_variable_type: Optional[str] = None + return_type: Optional[str] = None + nodetype: Optional[NodeTypes] = None + force_true: bool = False + comment: Optional[str] = None + + def __str__(self) -> str: + parts = [] + parts.append(self.function) + if self.arguments: + parts.append(f"({', '.join(map(str, self.arguments))})") + if self.force_true: + parts.append(", !p->error_indicator") + if self.assigned_variable: + if self.assigned_variable_type: + parts = [ + "(", + self.assigned_variable, + " = ", + "(", + self.assigned_variable_type, + ")", + *parts, + ")", + ] + else: + parts = ["(", self.assigned_variable, " = ", *parts, ")"] + if self.comment: + parts.append(f" // {self.comment}") + return "".join(parts) + + +class CCallMakerVisitor(GrammarVisitor): + def __init__( + self, + parser_generator: ParserGenerator, + exact_tokens: Dict[str, int], + non_exact_tokens: Set[str], + ): + self.gen = parser_generator + self.exact_tokens = exact_tokens + self.non_exact_tokens = non_exact_tokens + self.cache: Dict[Any, FunctionCall] = {} + self.cleanup_statements: List[str] = [] + + def keyword_helper(self, keyword: str) -> FunctionCall: + return FunctionCall( + assigned_variable="_keyword", + function="_PyPegen_expect_token", + arguments=["p", self.gen.keywords[keyword]], + 
return_type="Token *", + nodetype=NodeTypes.KEYWORD, + comment=f"token='{keyword}'", + ) + + def soft_keyword_helper(self, value: str) -> FunctionCall: + return FunctionCall( + assigned_variable="_keyword", + function="_PyPegen_expect_soft_keyword", + arguments=["p", value], + return_type="expr_ty", + nodetype=NodeTypes.SOFT_KEYWORD, + comment=f"soft_keyword='{value}'", + ) + + def visit_NameLeaf(self, node: NameLeaf) -> FunctionCall: + name = node.value + if name in self.non_exact_tokens: + if name in BASE_NODETYPES: + return FunctionCall( + assigned_variable=f"{name.lower()}_var", + function=f"_PyPegen_{name.lower()}_token", + arguments=["p"], + nodetype=BASE_NODETYPES[name], + return_type="expr_ty", + comment=name, + ) + return FunctionCall( + assigned_variable=f"{name.lower()}_var", + function=f"_PyPegen_expect_token", + arguments=["p", name], + nodetype=NodeTypes.GENERIC_TOKEN, + return_type="Token *", + comment=f"token='{name}'", + ) + + type = None + rule = self.gen.all_rules.get(name.lower()) + if rule is not None: + type = "asdl_seq *" if rule.is_loop() or rule.is_gather() else rule.type + + return FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + return_type=type, + comment=f"{node}", + ) + + def visit_StringLeaf(self, node: StringLeaf) -> FunctionCall: + val = ast.literal_eval(node.value) + if re.match(r"[a-zA-Z_]\w*\Z", val): # This is a keyword + if node.value.endswith("'"): + return self.keyword_helper(val) + else: + return self.soft_keyword_helper(node.value) + else: + assert val in self.exact_tokens, f"{node.value} is not a known literal" + type = self.exact_tokens[val] + return FunctionCall( + assigned_variable="_literal", + function=f"_PyPegen_expect_token", + arguments=["p", type], + nodetype=NodeTypes.GENERIC_TOKEN, + return_type="Token *", + comment=f"token='{val}'", + ) + + def visit_Rhs(self, node: Rhs) -> FunctionCall: + if node in self.cache: + return self.cache[node] + if node.can_be_inlined: + self.cache[node] = self.generate_call(node.alts[0].items[0]) + else: + name = self.gen.artifical_rule_from_rhs(node) + self.cache[node] = FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + comment=f"{node}", + ) + return self.cache[node] + + def visit_NamedItem(self, node: NamedItem) -> FunctionCall: + call = self.generate_call(node.item) + if node.name: + call.assigned_variable = node.name + if node.type: + call.assigned_variable_type = node.type + return call + + def lookahead_call_helper(self, node: Lookahead, positive: int) -> FunctionCall: + call = self.generate_call(node.node) + if call.nodetype == NodeTypes.NAME_TOKEN: + return FunctionCall( + function=f"_PyPegen_lookahead_with_name", + arguments=[positive, call.function, *call.arguments], + return_type="int", + ) + elif call.nodetype == NodeTypes.SOFT_KEYWORD: + return FunctionCall( + function=f"_PyPegen_lookahead_with_string", + arguments=[positive, call.function, *call.arguments], + return_type="int", + ) + elif call.nodetype in {NodeTypes.GENERIC_TOKEN, NodeTypes.KEYWORD}: + return FunctionCall( + function=f"_PyPegen_lookahead_with_int", + arguments=[positive, call.function, *call.arguments], + return_type="int", + comment=f"token={node.node}", + ) + else: + return FunctionCall( + function=f"_PyPegen_lookahead", + arguments=[positive, call.function, *call.arguments], + return_type="int", + ) + + def visit_PositiveLookahead(self, node: PositiveLookahead) -> FunctionCall: + return self.lookahead_call_helper(node, 1) + + def 
visit_NegativeLookahead(self, node: NegativeLookahead) -> FunctionCall: + return self.lookahead_call_helper(node, 0) + + def visit_Forced(self, node: Forced) -> FunctionCall: + call = self.generate_call(node.node) + if isinstance(node.node, Leaf): + assert isinstance(node.node, Leaf) + val = ast.literal_eval(node.node.value) + assert val in self.exact_tokens, f"{node.node.value} is not a known literal" + type = self.exact_tokens[val] + return FunctionCall( + assigned_variable="_literal", + function=f"_PyPegen_expect_forced_token", + arguments=["p", type, f'"{val}"'], + nodetype=NodeTypes.GENERIC_TOKEN, + return_type="Token *", + comment=f"forced_token='{val}'", + ) + if isinstance(node.node, Group): + call = self.visit(node.node.rhs) + call.assigned_variable = None + call.comment = None + return FunctionCall( + assigned_variable="_literal", + function=f"_PyPegen_expect_forced_result", + arguments=["p", str(call), f'"{node.node.rhs!s}"'], + return_type="void *", + comment=f"forced_token=({node.node.rhs!s})", + ) + else: + raise NotImplementedError(f"Forced tokens don't work with {node.node} nodes") + + def visit_Opt(self, node: Opt) -> FunctionCall: + call = self.generate_call(node.node) + return FunctionCall( + assigned_variable="_opt_var", + function=call.function, + arguments=call.arguments, + force_true=True, + comment=f"{node}", + ) + + def visit_Repeat0(self, node: Repeat0) -> FunctionCall: + if node in self.cache: + return self.cache[node] + name = self.gen.artificial_rule_from_repeat(node.node, False) + self.cache[node] = FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + return_type="asdl_seq *", + comment=f"{node}", + ) + return self.cache[node] + + def visit_Repeat1(self, node: Repeat1) -> FunctionCall: + if node in self.cache: + return self.cache[node] + name = self.gen.artificial_rule_from_repeat(node.node, True) + self.cache[node] = FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + return_type="asdl_seq *", + comment=f"{node}", + ) + return self.cache[node] + + def visit_Gather(self, node: Gather) -> FunctionCall: + if node in self.cache: + return self.cache[node] + name = self.gen.artifical_rule_from_gather(node) + self.cache[node] = FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + return_type="asdl_seq *", + comment=f"{node}", + ) + return self.cache[node] + + def visit_Group(self, node: Group) -> FunctionCall: + return self.generate_call(node.rhs) + + def visit_Cut(self, node: Cut) -> FunctionCall: + return FunctionCall( + assigned_variable="_cut_var", + return_type="int", + function="1", + nodetype=NodeTypes.CUT_OPERATOR, + ) + + def generate_call(self, node: Any) -> FunctionCall: + return super().visit(node) + + +class CParserGenerator(ParserGenerator, GrammarVisitor): + def __init__( + self, + grammar: grammar.Grammar, + tokens: Dict[int, str], + exact_tokens: Dict[str, int], + non_exact_tokens: Set[str], + file: Optional[IO[Text]], + debug: bool = False, + skip_actions: bool = False, + ): + super().__init__(grammar, set(tokens.values()), file) + self.callmakervisitor: CCallMakerVisitor = CCallMakerVisitor( + self, exact_tokens, non_exact_tokens + ) + self._varname_counter = 0 + self.debug = debug + self.skip_actions = skip_actions + self.cleanup_statements: List[str] = [] + + def add_level(self) -> None: + self.print("if (p->level++ == MAXSTACK) {") + with self.indent(): + self.print("_Pypegen_stack_overflow(p);") + self.print("}") 
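+ # add_level()/remove_level() bracket every generated rule function with the p->level depth counter that is checked against MAXSTACK above.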
+ + def remove_level(self) -> None: + self.print("p->level--;") + + def add_return(self, ret_val: str) -> None: + for stmt in self.cleanup_statements: + self.print(stmt) + self.remove_level() + self.print(f"return {ret_val};") + + def unique_varname(self, name: str = "tmpvar") -> str: + new_var = name + "_" + str(self._varname_counter) + self._varname_counter += 1 + return new_var + + def call_with_errorcheck_return(self, call_text: str, returnval: str) -> None: + error_var = self.unique_varname() + self.print(f"int {error_var} = {call_text};") + self.print(f"if ({error_var}) {{") + with self.indent(): + self.add_return(returnval) + self.print("}") + + def call_with_errorcheck_goto(self, call_text: str, goto_target: str) -> None: + error_var = self.unique_varname() + self.print(f"int {error_var} = {call_text};") + self.print(f"if ({error_var}) {{") + with self.indent(): + self.print(f"goto {goto_target};") + self.print(f"}}") + + def out_of_memory_return( + self, + expr: str, + cleanup_code: Optional[str] = None, + ) -> None: + self.print(f"if ({expr}) {{") + with self.indent(): + if cleanup_code is not None: + self.print(cleanup_code) + self.print("p->error_indicator = 1;") + self.print("PyErr_NoMemory();") + self.add_return("NULL") + self.print(f"}}") + + def out_of_memory_goto(self, expr: str, goto_target: str) -> None: + self.print(f"if ({expr}) {{") + with self.indent(): + self.print("PyErr_NoMemory();") + self.print(f"goto {goto_target};") + self.print(f"}}") + + def generate(self, filename: str) -> None: + self.collect_rules() + basename = os.path.basename(filename) + self.print(f"// @generated by pegen from {basename}") + header = self.grammar.metas.get("header", EXTENSION_PREFIX) + if header: + self.print(header.rstrip("\n")) + subheader = self.grammar.metas.get("subheader", "") + if subheader: + self.print(subheader) + self._setup_keywords() + self._setup_soft_keywords() + for i, (rulename, rule) in enumerate(self.all_rules.items(), 1000): + comment = " // Left-recursive" if rule.left_recursive else "" + self.print(f"#define {rulename}_type {i}{comment}") + self.print() + for rulename, rule in self.all_rules.items(): + if rule.is_loop() or rule.is_gather(): + type = "asdl_seq *" + elif rule.type: + type = rule.type + " " + else: + type = "void *" + self.print(f"static {type}{rulename}_rule(Parser *p);") + self.print() + for rulename, rule in list(self.all_rules.items()): + self.print() + if rule.left_recursive: + self.print("// Left-recursive") + self.visit(rule) + if self.skip_actions: + mode = 0 + else: + mode = int(self.rules["start"].type == "mod_ty") if "start" in self.rules else 1 + if mode == 1 and self.grammar.metas.get("bytecode"): + mode += 1 + modulename = self.grammar.metas.get("modulename", "parse") + trailer = self.grammar.metas.get("trailer", EXTENSION_SUFFIX) + if trailer: + self.print(trailer.rstrip("\n") % dict(mode=mode, modulename=modulename)) + + def _group_keywords_by_length(self) -> Dict[int, List[Tuple[str, int]]]: + groups: Dict[int, List[Tuple[str, int]]] = {} + for keyword_str, keyword_type in self.keywords.items(): + length = len(keyword_str) + if length in groups: + groups[length].append((keyword_str, keyword_type)) + else: + groups[length] = [(keyword_str, keyword_type)] + return groups + + def _setup_keywords(self) -> None: + n_keyword_lists = ( + len(max(self.keywords.keys(), key=len)) + 1 if len(self.keywords) > 0 else 0 + ) + self.print(f"static const int n_keyword_lists = {n_keyword_lists};") + groups = self._group_keywords_by_length() + 
self.print("static KeywordToken *reserved_keywords[] = {") + with self.indent(): + num_groups = max(groups) + 1 if groups else 1 + for keywords_length in range(num_groups): + if keywords_length not in groups.keys(): + self.print("(KeywordToken[]) {{NULL, -1}},") + else: + self.print("(KeywordToken[]) {") + with self.indent(): + for keyword_str, keyword_type in groups[keywords_length]: + self.print(f'{{"{keyword_str}", {keyword_type}}},') + self.print("{NULL, -1},") + self.print("},") + self.print("};") + + def _setup_soft_keywords(self) -> None: + soft_keywords = sorted(self.soft_keywords) + self.print("static char *soft_keywords[] = {") + with self.indent(): + for keyword in soft_keywords: + self.print(f'"{keyword}",') + self.print("NULL,") + self.print("};") + + def _set_up_token_start_metadata_extraction(self) -> None: + self.print("if (p->mark == p->fill && _PyPegen_fill_token(p) < 0) {") + with self.indent(): + self.print("p->error_indicator = 1;") + self.add_return("NULL") + self.print("}") + self.print("int _start_lineno = p->tokens[_mark]->lineno;") + self.print("UNUSED(_start_lineno); // Only used by EXTRA macro") + self.print("int _start_col_offset = p->tokens[_mark]->col_offset;") + self.print("UNUSED(_start_col_offset); // Only used by EXTRA macro") + + def _set_up_token_end_metadata_extraction(self) -> None: + self.print("Token *_token = _PyPegen_get_last_nonnwhitespace_token(p);") + self.print("if (_token == NULL) {") + with self.indent(): + self.add_return("NULL") + self.print("}") + self.print("int _end_lineno = _token->end_lineno;") + self.print("UNUSED(_end_lineno); // Only used by EXTRA macro") + self.print("int _end_col_offset = _token->end_col_offset;") + self.print("UNUSED(_end_col_offset); // Only used by EXTRA macro") + + def _check_for_errors(self) -> None: + self.print("if (p->error_indicator) {") + with self.indent(): + self.add_return("NULL") + self.print("}") + + def _set_up_rule_memoization(self, node: Rule, result_type: str) -> None: + self.print("{") + with self.indent(): + self.add_level() + self.print(f"{result_type} _res = NULL;") + self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res)) {{") + with self.indent(): + self.add_return("_res") + self.print("}") + self.print("int _mark = p->mark;") + self.print("int _resmark = p->mark;") + self.print("while (1) {") + with self.indent(): + self.call_with_errorcheck_return( + f"_PyPegen_update_memo(p, _mark, {node.name}_type, _res)", "_res" + ) + self.print("p->mark = _mark;") + self.print(f"void *_raw = {node.name}_raw(p);") + self.print("if (p->error_indicator) {") + with self.indent(): + self.add_return("NULL") + self.print("}") + self.print("if (_raw == NULL || p->mark <= _resmark)") + with self.indent(): + self.print("break;") + self.print(f"_resmark = p->mark;") + self.print("_res = _raw;") + self.print("}") + self.print(f"p->mark = _resmark;") + self.add_return("_res") + self.print("}") + self.print(f"static {result_type}") + self.print(f"{node.name}_raw(Parser *p)") + + def _should_memoize(self, node: Rule) -> bool: + return node.memo and not node.left_recursive + + def _handle_default_rule_body(self, node: Rule, rhs: Rhs, result_type: str) -> None: + memoize = self._should_memoize(node) + + with self.indent(): + self.add_level() + self._check_for_errors() + self.print(f"{result_type} _res = NULL;") + if memoize: + self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res)) {{") + with self.indent(): + self.add_return("_res") + self.print("}") + self.print("int _mark = p->mark;") + if 
any(alt.action and "EXTRA" in alt.action for alt in rhs.alts): + self._set_up_token_start_metadata_extraction() + self.visit( + rhs, + is_loop=False, + is_gather=node.is_gather(), + rulename=node.name, + ) + if self.debug: + self.print(f'D(fprintf(stderr, "Fail at %d: {node.name}\\n", p->mark));') + self.print("_res = NULL;") + self.print(" done:") + with self.indent(): + if memoize: + self.print(f"_PyPegen_insert_memo(p, _mark, {node.name}_type, _res);") + self.add_return("_res") + + def _handle_loop_rule_body(self, node: Rule, rhs: Rhs) -> None: + memoize = self._should_memoize(node) + is_repeat1 = node.name.startswith("_loop1") + + with self.indent(): + self.add_level() + self._check_for_errors() + self.print("void *_res = NULL;") + if memoize: + self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res)) {{") + with self.indent(): + self.add_return("_res") + self.print("}") + self.print("int _mark = p->mark;") + if memoize: + self.print("int _start_mark = p->mark;") + self.print("void **_children = PyMem_Malloc(sizeof(void *));") + self.out_of_memory_return(f"!_children") + self.print("Py_ssize_t _children_capacity = 1;") + self.print("Py_ssize_t _n = 0;") + if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts): + self._set_up_token_start_metadata_extraction() + self.visit( + rhs, + is_loop=True, + is_gather=node.is_gather(), + rulename=node.name, + ) + if is_repeat1: + self.print("if (_n == 0 || p->error_indicator) {") + with self.indent(): + self.print("PyMem_Free(_children);") + self.add_return("NULL") + self.print("}") + self.print("asdl_seq *_seq = (asdl_seq*)_Py_asdl_generic_seq_new(_n, p->arena);") + self.out_of_memory_return(f"!_seq", cleanup_code="PyMem_Free(_children);") + self.print("for (int i = 0; i < _n; i++) asdl_seq_SET_UNTYPED(_seq, i, _children[i]);") + self.print("PyMem_Free(_children);") + if memoize and node.name: + self.print(f"_PyPegen_insert_memo(p, _start_mark, {node.name}_type, _seq);") + self.add_return("_seq") + + def visit_Rule(self, node: Rule) -> None: + is_loop = node.is_loop() + is_gather = node.is_gather() + rhs = node.flatten() + if is_loop or is_gather: + result_type = "asdl_seq *" + elif node.type: + result_type = node.type + else: + result_type = "void *" + + for line in str(node).splitlines(): + self.print(f"// {line}") + if node.left_recursive and node.leader: + self.print(f"static {result_type} {node.name}_raw(Parser *);") + + self.print(f"static {result_type}") + self.print(f"{node.name}_rule(Parser *p)") + + if node.left_recursive and node.leader: + self._set_up_rule_memoization(node, result_type) + + self.print("{") + + if node.name.endswith("without_invalid"): + with self.indent(): + self.print("int _prev_call_invalid = p->call_invalid_rules;") + self.print("p->call_invalid_rules = 0;") + self.cleanup_statements.append("p->call_invalid_rules = _prev_call_invalid;") + + if is_loop: + self._handle_loop_rule_body(node, rhs) + else: + self._handle_default_rule_body(node, rhs, result_type) + + if node.name.endswith("without_invalid"): + self.cleanup_statements.pop() + + self.print("}") + + def visit_NamedItem(self, node: NamedItem) -> None: + call = self.callmakervisitor.generate_call(node) + if call.assigned_variable: + call.assigned_variable = self.dedupe(call.assigned_variable) + self.print(call) + + def visit_Rhs( + self, node: Rhs, is_loop: bool, is_gather: bool, rulename: Optional[str] + ) -> None: + if is_loop: + assert len(node.alts) == 1 + for alt in node.alts: + self.visit(alt, is_loop=is_loop, is_gather=is_gather, 
rulename=rulename) + + def join_conditions(self, keyword: str, node: Any) -> None: + self.print(f"{keyword} (") + with self.indent(): + first = True + for item in node.items: + if first: + first = False + else: + self.print("&&") + self.visit(item) + self.print(")") + + def emit_action(self, node: Alt, cleanup_code: Optional[str] = None) -> None: + self.print(f"_res = {node.action};") + + self.print("if (_res == NULL && PyErr_Occurred()) {") + with self.indent(): + self.print("p->error_indicator = 1;") + if cleanup_code: + self.print(cleanup_code) + self.add_return("NULL") + self.print("}") + + if self.debug: + self.print( + f'D(fprintf(stderr, "Hit with action [%d-%d]: %s\\n", _mark, p->mark, "{node}"));' + ) + + def emit_default_action(self, is_gather: bool, node: Alt) -> None: + if len(self.local_variable_names) > 1: + if is_gather: + assert len(self.local_variable_names) == 2 + self.print( + f"_res = _PyPegen_seq_insert_in_front(p, " + f"{self.local_variable_names[0]}, {self.local_variable_names[1]});" + ) + else: + if self.debug: + self.print( + f'D(fprintf(stderr, "Hit without action [%d:%d]: %s\\n", _mark, p->mark, "{node}"));' + ) + self.print( + f"_res = _PyPegen_dummy_name(p, {', '.join(self.local_variable_names)});" + ) + else: + if self.debug: + self.print( + f'D(fprintf(stderr, "Hit with default action [%d:%d]: %s\\n", _mark, p->mark, "{node}"));' + ) + self.print(f"_res = {self.local_variable_names[0]};") + + def emit_dummy_action(self) -> None: + self.print("_res = _PyPegen_dummy_name(p);") + + def handle_alt_normal(self, node: Alt, is_gather: bool, rulename: Optional[str]) -> None: + self.join_conditions(keyword="if", node=node) + self.print("{") + # We have parsed successfully all the conditions for the option. + with self.indent(): + node_str = str(node).replace('"', '\\"') + self.print( + f'D(fprintf(stderr, "%*c+ {rulename}[%d-%d]: %s succeeded!\\n", p->level, \' \', _mark, p->mark, "{node_str}"));' + ) + # Prepare to emit the rule action and do so + if node.action and "EXTRA" in node.action: + self._set_up_token_end_metadata_extraction() + if self.skip_actions: + self.emit_dummy_action() + elif node.action: + self.emit_action(node) + else: + self.emit_default_action(is_gather, node) + + # As the current option has parsed correctly, do not continue with the rest. + self.print(f"goto done;") + self.print("}") + + def handle_alt_loop(self, node: Alt, is_gather: bool, rulename: Optional[str]) -> None: + # Condition of the main body of the alternative + self.join_conditions(keyword="while", node=node) + self.print("{") + # We have parsed successfully one item! + with self.indent(): + # Prepare to emit the rule action and do so + if node.action and "EXTRA" in node.action: + self._set_up_token_end_metadata_extraction() + if self.skip_actions: + self.emit_dummy_action() + elif node.action: + self.emit_action(node, cleanup_code="PyMem_Free(_children);") + else: + self.emit_default_action(is_gather, node) + + # Add the result of rule to the temporary buffer of children. This buffer + # will populate later an asdl_seq with all elements to return. 
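+ # The children buffer grows geometrically (its capacity doubles), so appending each repetition stays amortized O(1).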
+ self.print("if (_n == _children_capacity) {") + with self.indent(): + self.print("_children_capacity *= 2;") + self.print( + "void **_new_children = PyMem_Realloc(_children, _children_capacity*sizeof(void *));" + ) + self.out_of_memory_return(f"!_new_children", cleanup_code="PyMem_Free(_children);") + self.print("_children = _new_children;") + self.print("}") + self.print("_children[_n++] = _res;") + self.print("_mark = p->mark;") + self.print("}") + + def visit_Alt( + self, node: Alt, is_loop: bool, is_gather: bool, rulename: Optional[str] + ) -> None: + if len(node.items) == 1 and str(node.items[0]).startswith("invalid_"): + self.print(f"if (p->call_invalid_rules) {{ // {node}") + else: + self.print(f"{{ // {node}") + with self.indent(): + self._check_for_errors() + node_str = str(node).replace('"', '\\"') + self.print( + f'D(fprintf(stderr, "%*c> {rulename}[%d-%d]: %s\\n", p->level, \' \', _mark, p->mark, "{node_str}"));' + ) + # Prepare variable declarations for the alternative + vars = self.collect_vars(node) + for v, var_type in sorted(item for item in vars.items() if item[0] is not None): + if not var_type: + var_type = "void *" + else: + var_type += " " + if v == "_cut_var": + v += " = 0" # cut_var must be initialized + self.print(f"{var_type}{v};") + if v and v.startswith("_opt_var"): + self.print(f"UNUSED({v}); // Silence compiler warnings") + + with self.local_variable_context(): + if is_loop: + self.handle_alt_loop(node, is_gather, rulename) + else: + self.handle_alt_normal(node, is_gather, rulename) + + self.print("p->mark = _mark;") + node_str = str(node).replace('"', '\\"') + self.print( + f"D(fprintf(stderr, \"%*c%s {rulename}[%d-%d]: %s failed!\\n\", p->level, ' ',\n" + f' p->error_indicator ? "ERROR!" : "-", _mark, p->mark, "{node_str}"));' + ) + if "_cut_var" in vars: + self.print("if (_cut_var) {") + with self.indent(): + self.add_return("NULL") + self.print("}") + self.print("}") + + def collect_vars(self, node: Alt) -> Dict[Optional[str], Optional[str]]: + types = {} + with self.local_variable_context(): + for item in node.items: + name, type = self.add_var(item) + types[name] = type + return types + + def add_var(self, node: NamedItem) -> Tuple[Optional[str], Optional[str]]: + call = self.callmakervisitor.generate_call(node.item) + name = node.name if node.name else call.assigned_variable + if name is not None: + name = self.dedupe(name) + return_type = call.return_type if node.type is None else node.type + return name, return_type diff --git a/scripts/pegen/first_sets.py b/scripts/pegen/first_sets.py new file mode 100644 index 000000000..6d794ffa4 --- /dev/null +++ b/scripts/pegen/first_sets.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3.8 + +import argparse +import pprint +import sys +from typing import Dict, Set + +from pegen.build import build_parser +from pegen.grammar import ( + Alt, + Cut, + Gather, + GrammarVisitor, + Group, + Lookahead, + NamedItem, + NameLeaf, + NegativeLookahead, + Opt, + Repeat0, + Repeat1, + Rhs, + Rule, + StringLeaf, +) +from pegen.parser_generator import compute_nullables + +argparser = argparse.ArgumentParser( + prog="calculate_first_sets", + description="Calculate the first sets of a grammar", +) +argparser.add_argument("grammar_file", help="The grammar file") + + +class FirstSetCalculator(GrammarVisitor): + def __init__(self, rules: Dict[str, Rule]) -> None: + self.rules = rules + self.nullables = compute_nullables(rules) + self.first_sets: Dict[str, Set[str]] = dict() + self.in_process: Set[str] = set() + + def calculate(self) -> 
Dict[str, Set[str]]: + for name, rule in self.rules.items(): + self.visit(rule) + return self.first_sets + + def visit_Alt(self, item: Alt) -> Set[str]: + result: Set[str] = set() + to_remove: Set[str] = set() + for other in item.items: + new_terminals = self.visit(other) + if isinstance(other.item, NegativeLookahead): + to_remove |= new_terminals + result |= new_terminals + if to_remove: + result -= to_remove + + # If the set of new terminals can start with the empty string, + # it means that the item is completely nullable and we should + # also considering at least the next item in case the current + # one fails to parse. + + if "" in new_terminals: + continue + + if not isinstance(other.item, (Opt, NegativeLookahead, Repeat0)): + break + + # Do not allow the empty string to propagate. + result.discard("") + + return result + + def visit_Cut(self, item: Cut) -> Set[str]: + return set() + + def visit_Group(self, item: Group) -> Set[str]: + return self.visit(item.rhs) + + def visit_PositiveLookahead(self, item: Lookahead) -> Set[str]: + return self.visit(item.node) + + def visit_NegativeLookahead(self, item: NegativeLookahead) -> Set[str]: + return self.visit(item.node) + + def visit_NamedItem(self, item: NamedItem) -> Set[str]: + return self.visit(item.item) + + def visit_Opt(self, item: Opt) -> Set[str]: + return self.visit(item.node) + + def visit_Gather(self, item: Gather) -> Set[str]: + return self.visit(item.node) + + def visit_Repeat0(self, item: Repeat0) -> Set[str]: + return self.visit(item.node) + + def visit_Repeat1(self, item: Repeat1) -> Set[str]: + return self.visit(item.node) + + def visit_NameLeaf(self, item: NameLeaf) -> Set[str]: + if item.value not in self.rules: + return {item.value} + + if item.value not in self.first_sets: + self.first_sets[item.value] = self.visit(self.rules[item.value]) + return self.first_sets[item.value] + elif item.value in self.in_process: + return set() + + return self.first_sets[item.value] + + def visit_StringLeaf(self, item: StringLeaf) -> Set[str]: + return {item.value} + + def visit_Rhs(self, item: Rhs) -> Set[str]: + result: Set[str] = set() + for alt in item.alts: + result |= self.visit(alt) + return result + + def visit_Rule(self, item: Rule) -> Set[str]: + if item.name in self.in_process: + return set() + elif item.name not in self.first_sets: + self.in_process.add(item.name) + terminals = self.visit(item.rhs) + if item in self.nullables: + terminals.add("") + self.first_sets[item.name] = terminals + self.in_process.remove(item.name) + return self.first_sets[item.name] + + +def main() -> None: + args = argparser.parse_args() + + try: + grammar, parser, tokenizer = build_parser(args.grammar_file) + except Exception as err: + print("ERROR: Failed to parse grammar file", file=sys.stderr) + sys.exit(1) + + firs_sets = FirstSetCalculator(grammar.rules).calculate() + pprint.pprint(firs_sets) + + +if __name__ == "__main__": + main() diff --git a/scripts/pegen/grammar.py b/scripts/pegen/grammar.py new file mode 100644 index 000000000..a6c19bc7b --- /dev/null +++ b/scripts/pegen/grammar.py @@ -0,0 +1,363 @@ +from __future__ import annotations + +from typing import ( + AbstractSet, + Any, + Iterable, + Iterator, + List, + Optional, + Tuple, + Union, +) + + +class GrammarError(Exception): + pass + + +class GrammarVisitor: + def visit(self, node: Any, *args: Any, **kwargs: Any) -> Any: + """Visit a node.""" + method = "visit_" + node.__class__.__name__ + visitor = getattr(self, method, self.generic_visit) + return visitor(node, *args, **kwargs) + 
+ def generic_visit(self, node: Iterable[Any], *args: Any, **kwargs: Any) -> Any: + """Called if no explicit visitor function exists for a node.""" + for value in node: + if isinstance(value, list): + for item in value: + self.visit(item, *args, **kwargs) + else: + self.visit(value, *args, **kwargs) + + +class Grammar: + def __init__(self, rules: Iterable[Rule], metas: Iterable[Tuple[str, Optional[str]]]): + # Check if there are repeated rules in "rules" + all_rules = {} + for rule in rules: + if rule.name in all_rules: + raise GrammarError(f"Repeated rule {rule.name!r}") + all_rules[rule.name] = rule + self.rules = all_rules + self.metas = dict(metas) + + def __str__(self) -> str: + return "\n".join(str(rule) for name, rule in self.rules.items()) + + def __repr__(self) -> str: + lines = ["Grammar("] + lines.append(" [") + for rule in self.rules.values(): + lines.append(f" {repr(rule)},") + lines.append(" ],") + lines.append(" {repr(list(self.metas.items()))}") + lines.append(")") + return "\n".join(lines) + + def __iter__(self) -> Iterator[Rule]: + yield from self.rules.values() + + +# Global flag whether we want actions in __str__() -- default off. +SIMPLE_STR = True + + +class Rule: + def __init__(self, name: str, type: Optional[str], rhs: Rhs, memo: Optional[object] = None): + self.name = name + self.type = type + self.rhs = rhs + self.memo = bool(memo) + self.left_recursive = False + self.leader = False + + def is_loop(self) -> bool: + return self.name.startswith("_loop") + + def is_gather(self) -> bool: + return self.name.startswith("_gather") + + def __str__(self) -> str: + if SIMPLE_STR or self.type is None: + res = f"{self.name}: {self.rhs}" + else: + res = f"{self.name}[{self.type}]: {self.rhs}" + if len(res) < 88: + return res + lines = [res.split(":")[0] + ":"] + lines += [f" | {alt}" for alt in self.rhs.alts] + return "\n".join(lines) + + def __repr__(self) -> str: + return f"Rule({self.name!r}, {self.type!r}, {self.rhs!r})" + + def __iter__(self) -> Iterator[Rhs]: + yield self.rhs + + def flatten(self) -> Rhs: + # If it's a single parenthesized group, flatten it. 
+ rhs = self.rhs + if ( + not self.is_loop() + and len(rhs.alts) == 1 + and len(rhs.alts[0].items) == 1 + and isinstance(rhs.alts[0].items[0].item, Group) + ): + rhs = rhs.alts[0].items[0].item.rhs + return rhs + + +class Leaf: + def __init__(self, value: str): + self.value = value + + def __str__(self) -> str: + return self.value + + def __iter__(self) -> Iterable[str]: + if False: + yield + + +class NameLeaf(Leaf): + """The value is the name.""" + + def __str__(self) -> str: + if self.value == "ENDMARKER": + return "$" + return super().__str__() + + def __repr__(self) -> str: + return f"NameLeaf({self.value!r})" + + +class StringLeaf(Leaf): + """The value is a string literal, including quotes.""" + + def __repr__(self) -> str: + return f"StringLeaf({self.value!r})" + + +class Rhs: + def __init__(self, alts: List[Alt]): + self.alts = alts + self.memo: Optional[Tuple[Optional[str], str]] = None + + def __str__(self) -> str: + return " | ".join(str(alt) for alt in self.alts) + + def __repr__(self) -> str: + return f"Rhs({self.alts!r})" + + def __iter__(self) -> Iterator[List[Alt]]: + yield self.alts + + @property + def can_be_inlined(self) -> bool: + if len(self.alts) != 1 or len(self.alts[0].items) != 1: + return False + # If the alternative has an action we cannot inline + if getattr(self.alts[0], "action", None) is not None: + return False + return True + + +class Alt: + def __init__(self, items: List[NamedItem], *, icut: int = -1, action: Optional[str] = None): + self.items = items + self.icut = icut + self.action = action + + def __str__(self) -> str: + core = " ".join(str(item) for item in self.items) + if not SIMPLE_STR and self.action: + return f"{core} {{ {self.action} }}" + else: + return core + + def __repr__(self) -> str: + args = [repr(self.items)] + if self.icut >= 0: + args.append(f"icut={self.icut}") + if self.action: + args.append(f"action={self.action!r}") + return f"Alt({', '.join(args)})" + + def __iter__(self) -> Iterator[List[NamedItem]]: + yield self.items + + +class NamedItem: + def __init__(self, name: Optional[str], item: Item, type: Optional[str] = None): + self.name = name + self.item = item + self.type = type + + def __str__(self) -> str: + if not SIMPLE_STR and self.name: + return f"{self.name}={self.item}" + else: + return str(self.item) + + def __repr__(self) -> str: + return f"NamedItem({self.name!r}, {self.item!r})" + + def __iter__(self) -> Iterator[Item]: + yield self.item + + +class Forced: + def __init__(self, node: Plain): + self.node = node + + def __str__(self) -> str: + return f"&&{self.node}" + + def __iter__(self) -> Iterator[Plain]: + yield self.node + + +class Lookahead: + def __init__(self, node: Plain, sign: str): + self.node = node + self.sign = sign + + def __str__(self) -> str: + return f"{self.sign}{self.node}" + + def __iter__(self) -> Iterator[Plain]: + yield self.node + + +class PositiveLookahead(Lookahead): + def __init__(self, node: Plain): + super().__init__(node, "&") + + def __repr__(self) -> str: + return f"PositiveLookahead({self.node!r})" + + +class NegativeLookahead(Lookahead): + def __init__(self, node: Plain): + super().__init__(node, "!") + + def __repr__(self) -> str: + return f"NegativeLookahead({self.node!r})" + + +class Opt: + def __init__(self, node: Item): + self.node = node + + def __str__(self) -> str: + s = str(self.node) + # TODO: Decide whether to use [X] or X? based on type of X + if " " in s: + return f"[{s}]" + else: + return f"{s}?" 
+ + def __repr__(self) -> str: + return f"Opt({self.node!r})" + + def __iter__(self) -> Iterator[Item]: + yield self.node + + +class Repeat: + """Shared base class for x* and x+.""" + + def __init__(self, node: Plain): + self.node = node + self.memo: Optional[Tuple[Optional[str], str]] = None + + def __iter__(self) -> Iterator[Plain]: + yield self.node + + +class Repeat0(Repeat): + def __str__(self) -> str: + s = str(self.node) + # TODO: Decide whether to use (X)* or X* based on type of X + if " " in s: + return f"({s})*" + else: + return f"{s}*" + + def __repr__(self) -> str: + return f"Repeat0({self.node!r})" + + +class Repeat1(Repeat): + def __str__(self) -> str: + s = str(self.node) + # TODO: Decide whether to use (X)+ or X+ based on type of X + if " " in s: + return f"({s})+" + else: + return f"{s}+" + + def __repr__(self) -> str: + return f"Repeat1({self.node!r})" + + +class Gather(Repeat): + def __init__(self, separator: Plain, node: Plain): + self.separator = separator + self.node = node + + def __str__(self) -> str: + return f"{self.separator!s}.{self.node!s}+" + + def __repr__(self) -> str: + return f"Gather({self.separator!r}, {self.node!r})" + + +class Group: + def __init__(self, rhs: Rhs): + self.rhs = rhs + + def __str__(self) -> str: + return f"({self.rhs})" + + def __repr__(self) -> str: + return f"Group({self.rhs!r})" + + def __iter__(self) -> Iterator[Rhs]: + yield self.rhs + + +class Cut: + def __init__(self) -> None: + pass + + def __repr__(self) -> str: + return f"Cut()" + + def __str__(self) -> str: + return f"~" + + def __iter__(self) -> Iterator[Tuple[str, str]]: + if False: + yield + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Cut): + return NotImplemented + return True + + def initial_names(self) -> AbstractSet[str]: + return set() + + +Plain = Union[Leaf, Group] +Item = Union[Plain, Opt, Repeat, Forced, Lookahead, Rhs, Cut] +RuleName = Tuple[str, str] +MetaTuple = Tuple[str, Optional[str]] +MetaList = List[MetaTuple] +RuleList = List[Rule] +NamedItemList = List[NamedItem] +LookaheadOrCut = Union[Lookahead, Cut] diff --git a/scripts/pegen/grammar_parser.py b/scripts/pegen/grammar_parser.py new file mode 100644 index 000000000..bf31fe532 --- /dev/null +++ b/scripts/pegen/grammar_parser.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python3.8 +# @generated by pegen from metagrammar.gram + +import ast +import sys +import tokenize + +from typing import Any, Optional + +from pegen.parser import memoize, memoize_left_rec, logger, Parser +from ast import literal_eval + +from pegen.grammar import ( + Alt, + Cut, + Forced, + Gather, + Group, + Item, + Lookahead, + LookaheadOrCut, + MetaTuple, + MetaList, + NameLeaf, + NamedItem, + NamedItemList, + NegativeLookahead, + Opt, + Plain, + PositiveLookahead, + Repeat0, + Repeat1, + Rhs, + Rule, + RuleList, + RuleName, + Grammar, + StringLeaf, +) + +# Keywords and soft keywords are listed at the end of the parser definition. 
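+# Every rule method below follows the same backtracking pattern: record the current
+# token position with self._mark(), try each alternative of the rule in source order,
+# and call self._reset(mark) after a failed alternative so the next one starts from
+# the same position; if no alternative matches, the method returns None. All rule
+# methods are wrapped in @memoize, so repeated attempts at the same position are
+# answered from the cache instead of being re-parsed.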
+class GeneratedParser(Parser): + + @memoize + def start(self) -> Optional[Grammar]: + # start: grammar $ + mark = self._mark() + if ( + (grammar := self.grammar()) + and + (_endmarker := self.expect('ENDMARKER')) + ): + return grammar + self._reset(mark) + return None + + @memoize + def grammar(self) -> Optional[Grammar]: + # grammar: metas rules | rules + mark = self._mark() + if ( + (metas := self.metas()) + and + (rules := self.rules()) + ): + return Grammar ( rules , metas ) + self._reset(mark) + if ( + (rules := self.rules()) + ): + return Grammar ( rules , [] ) + self._reset(mark) + return None + + @memoize + def metas(self) -> Optional[MetaList]: + # metas: meta metas | meta + mark = self._mark() + if ( + (meta := self.meta()) + and + (metas := self.metas()) + ): + return [meta] + metas + self._reset(mark) + if ( + (meta := self.meta()) + ): + return [meta] + self._reset(mark) + return None + + @memoize + def meta(self) -> Optional[MetaTuple]: + # meta: "@" NAME NEWLINE | "@" NAME NAME NEWLINE | "@" NAME STRING NEWLINE + mark = self._mark() + if ( + (literal := self.expect("@")) + and + (name := self.name()) + and + (_newline := self.expect('NEWLINE')) + ): + return ( name . string , None ) + self._reset(mark) + if ( + (literal := self.expect("@")) + and + (a := self.name()) + and + (b := self.name()) + and + (_newline := self.expect('NEWLINE')) + ): + return ( a . string , b . string ) + self._reset(mark) + if ( + (literal := self.expect("@")) + and + (name := self.name()) + and + (string := self.string()) + and + (_newline := self.expect('NEWLINE')) + ): + return ( name . string , literal_eval ( string . string ) ) + self._reset(mark) + return None + + @memoize + def rules(self) -> Optional[RuleList]: + # rules: rule rules | rule + mark = self._mark() + if ( + (rule := self.rule()) + and + (rules := self.rules()) + ): + return [rule] + rules + self._reset(mark) + if ( + (rule := self.rule()) + ): + return [rule] + self._reset(mark) + return None + + @memoize + def rule(self) -> Optional[Rule]: + # rule: rulename memoflag? ":" alts NEWLINE INDENT more_alts DEDENT | rulename memoflag? ":" NEWLINE INDENT more_alts DEDENT | rulename memoflag? ":" alts NEWLINE + mark = self._mark() + if ( + (rulename := self.rulename()) + and + (opt := self.memoflag(),) + and + (literal := self.expect(":")) + and + (alts := self.alts()) + and + (_newline := self.expect('NEWLINE')) + and + (_indent := self.expect('INDENT')) + and + (more_alts := self.more_alts()) + and + (_dedent := self.expect('DEDENT')) + ): + return Rule ( rulename [0] , rulename [1] , Rhs ( alts . alts + more_alts . alts ) , memo = opt ) + self._reset(mark) + if ( + (rulename := self.rulename()) + and + (opt := self.memoflag(),) + and + (literal := self.expect(":")) + and + (_newline := self.expect('NEWLINE')) + and + (_indent := self.expect('INDENT')) + and + (more_alts := self.more_alts()) + and + (_dedent := self.expect('DEDENT')) + ): + return Rule ( rulename [0] , rulename [1] , more_alts , memo = opt ) + self._reset(mark) + if ( + (rulename := self.rulename()) + and + (opt := self.memoflag(),) + and + (literal := self.expect(":")) + and + (alts := self.alts()) + and + (_newline := self.expect('NEWLINE')) + ): + return Rule ( rulename [0] , rulename [1] , alts , memo = opt ) + self._reset(mark) + return None + + @memoize + def rulename(self) -> Optional[RuleName]: + # rulename: NAME annotation | NAME + mark = self._mark() + if ( + (name := self.name()) + and + (annotation := self.annotation()) + ): + return ( name . 
string , annotation ) + self._reset(mark) + if ( + (name := self.name()) + ): + return ( name . string , None ) + self._reset(mark) + return None + + @memoize + def memoflag(self) -> Optional[str]: + # memoflag: '(' "memo" ')' + mark = self._mark() + if ( + (literal := self.expect('(')) + and + (literal_1 := self.expect("memo")) + and + (literal_2 := self.expect(')')) + ): + return "memo" + self._reset(mark) + return None + + @memoize + def alts(self) -> Optional[Rhs]: + # alts: alt "|" alts | alt + mark = self._mark() + if ( + (alt := self.alt()) + and + (literal := self.expect("|")) + and + (alts := self.alts()) + ): + return Rhs ( [alt] + alts . alts ) + self._reset(mark) + if ( + (alt := self.alt()) + ): + return Rhs ( [alt] ) + self._reset(mark) + return None + + @memoize + def more_alts(self) -> Optional[Rhs]: + # more_alts: "|" alts NEWLINE more_alts | "|" alts NEWLINE + mark = self._mark() + if ( + (literal := self.expect("|")) + and + (alts := self.alts()) + and + (_newline := self.expect('NEWLINE')) + and + (more_alts := self.more_alts()) + ): + return Rhs ( alts . alts + more_alts . alts ) + self._reset(mark) + if ( + (literal := self.expect("|")) + and + (alts := self.alts()) + and + (_newline := self.expect('NEWLINE')) + ): + return Rhs ( alts . alts ) + self._reset(mark) + return None + + @memoize + def alt(self) -> Optional[Alt]: + # alt: items '$' action | items '$' | items action | items + mark = self._mark() + if ( + (items := self.items()) + and + (literal := self.expect('$')) + and + (action := self.action()) + ): + return Alt ( items + [NamedItem ( None , NameLeaf ( 'ENDMARKER' ) )] , action = action ) + self._reset(mark) + if ( + (items := self.items()) + and + (literal := self.expect('$')) + ): + return Alt ( items + [NamedItem ( None , NameLeaf ( 'ENDMARKER' ) )] , action = None ) + self._reset(mark) + if ( + (items := self.items()) + and + (action := self.action()) + ): + return Alt ( items , action = action ) + self._reset(mark) + if ( + (items := self.items()) + ): + return Alt ( items , action = None ) + self._reset(mark) + return None + + @memoize + def items(self) -> Optional[NamedItemList]: + # items: named_item items | named_item + mark = self._mark() + if ( + (named_item := self.named_item()) + and + (items := self.items()) + ): + return [named_item] + items + self._reset(mark) + if ( + (named_item := self.named_item()) + ): + return [named_item] + self._reset(mark) + return None + + @memoize + def named_item(self) -> Optional[NamedItem]: + # named_item: NAME annotation '=' ~ item | NAME '=' ~ item | item | forced_atom | lookahead + mark = self._mark() + cut = False + if ( + (name := self.name()) + and + (annotation := self.annotation()) + and + (literal := self.expect('=')) + and + (cut := True) + and + (item := self.item()) + ): + return NamedItem ( name . string , item , annotation ) + self._reset(mark) + if cut: return None + cut = False + if ( + (name := self.name()) + and + (literal := self.expect('=')) + and + (cut := True) + and + (item := self.item()) + ): + return NamedItem ( name . 
string , item ) + self._reset(mark) + if cut: return None + if ( + (item := self.item()) + ): + return NamedItem ( None , item ) + self._reset(mark) + if ( + (forced := self.forced_atom()) + ): + return NamedItem ( None , forced ) + self._reset(mark) + if ( + (it := self.lookahead()) + ): + return NamedItem ( None , it ) + self._reset(mark) + return None + + @memoize + def forced_atom(self) -> Optional[Forced]: + # forced_atom: '&' '&' ~ atom + mark = self._mark() + cut = False + if ( + (literal := self.expect('&')) + and + (literal_1 := self.expect('&')) + and + (cut := True) + and + (atom := self.atom()) + ): + return Forced ( atom ) + self._reset(mark) + if cut: return None + return None + + @memoize + def lookahead(self) -> Optional[LookaheadOrCut]: + # lookahead: '&' ~ atom | '!' ~ atom | '~' + mark = self._mark() + cut = False + if ( + (literal := self.expect('&')) + and + (cut := True) + and + (atom := self.atom()) + ): + return PositiveLookahead ( atom ) + self._reset(mark) + if cut: return None + cut = False + if ( + (literal := self.expect('!')) + and + (cut := True) + and + (atom := self.atom()) + ): + return NegativeLookahead ( atom ) + self._reset(mark) + if cut: return None + if ( + (literal := self.expect('~')) + ): + return Cut ( ) + self._reset(mark) + return None + + @memoize + def item(self) -> Optional[Item]: + # item: '[' ~ alts ']' | atom '?' | atom '*' | atom '+' | atom '.' atom '+' | atom + mark = self._mark() + cut = False + if ( + (literal := self.expect('[')) + and + (cut := True) + and + (alts := self.alts()) + and + (literal_1 := self.expect(']')) + ): + return Opt ( alts ) + self._reset(mark) + if cut: return None + if ( + (atom := self.atom()) + and + (literal := self.expect('?')) + ): + return Opt ( atom ) + self._reset(mark) + if ( + (atom := self.atom()) + and + (literal := self.expect('*')) + ): + return Repeat0 ( atom ) + self._reset(mark) + if ( + (atom := self.atom()) + and + (literal := self.expect('+')) + ): + return Repeat1 ( atom ) + self._reset(mark) + if ( + (sep := self.atom()) + and + (literal := self.expect('.')) + and + (node := self.atom()) + and + (literal_1 := self.expect('+')) + ): + return Gather ( sep , node ) + self._reset(mark) + if ( + (atom := self.atom()) + ): + return atom + self._reset(mark) + return None + + @memoize + def atom(self) -> Optional[Plain]: + # atom: '(' ~ alts ')' | NAME | STRING + mark = self._mark() + cut = False + if ( + (literal := self.expect('(')) + and + (cut := True) + and + (alts := self.alts()) + and + (literal_1 := self.expect(')')) + ): + return Group ( alts ) + self._reset(mark) + if cut: return None + if ( + (name := self.name()) + ): + return NameLeaf ( name . string ) + self._reset(mark) + if ( + (string := self.string()) + ): + return StringLeaf ( string . 
string ) + self._reset(mark) + return None + + @memoize + def action(self) -> Optional[str]: + # action: "{" ~ target_atoms "}" + mark = self._mark() + cut = False + if ( + (literal := self.expect("{")) + and + (cut := True) + and + (target_atoms := self.target_atoms()) + and + (literal_1 := self.expect("}")) + ): + return target_atoms + self._reset(mark) + if cut: return None + return None + + @memoize + def annotation(self) -> Optional[str]: + # annotation: "[" ~ target_atoms "]" + mark = self._mark() + cut = False + if ( + (literal := self.expect("[")) + and + (cut := True) + and + (target_atoms := self.target_atoms()) + and + (literal_1 := self.expect("]")) + ): + return target_atoms + self._reset(mark) + if cut: return None + return None + + @memoize + def target_atoms(self) -> Optional[str]: + # target_atoms: target_atom target_atoms | target_atom + mark = self._mark() + if ( + (target_atom := self.target_atom()) + and + (target_atoms := self.target_atoms()) + ): + return target_atom + " " + target_atoms + self._reset(mark) + if ( + (target_atom := self.target_atom()) + ): + return target_atom + self._reset(mark) + return None + + @memoize + def target_atom(self) -> Optional[str]: + # target_atom: "{" ~ target_atoms? "}" | "[" ~ target_atoms? "]" | NAME "*" | NAME | NUMBER | STRING | "?" | ":" | !"}" !"]" OP + mark = self._mark() + cut = False + if ( + (literal := self.expect("{")) + and + (cut := True) + and + (atoms := self.target_atoms(),) + and + (literal_1 := self.expect("}")) + ): + return "{" + ( atoms or "" ) + "}" + self._reset(mark) + if cut: return None + cut = False + if ( + (literal := self.expect("[")) + and + (cut := True) + and + (atoms := self.target_atoms(),) + and + (literal_1 := self.expect("]")) + ): + return "[" + ( atoms or "" ) + "]" + self._reset(mark) + if cut: return None + if ( + (name := self.name()) + and + (literal := self.expect("*")) + ): + return name . string + "*" + self._reset(mark) + if ( + (name := self.name()) + ): + return name . string + self._reset(mark) + if ( + (number := self.number()) + ): + return number . string + self._reset(mark) + if ( + (string := self.string()) + ): + return string . string + self._reset(mark) + if ( + (literal := self.expect("?")) + ): + return "?" + self._reset(mark) + if ( + (literal := self.expect(":")) + ): + return ":" + self._reset(mark) + if ( + self.negative_lookahead(self.expect, "}") + and + self.negative_lookahead(self.expect, "]") + and + (op := self.op()) + ): + return op . 
string + self._reset(mark) + return None + + KEYWORDS = () + SOFT_KEYWORDS = ('memo',) + + +if __name__ == '__main__': + from pegen.parser import simple_parser_main + simple_parser_main(GeneratedParser) diff --git a/scripts/pegen/grammar_visualizer.py b/scripts/pegen/grammar_visualizer.py new file mode 100644 index 000000000..ab5c6364f --- /dev/null +++ b/scripts/pegen/grammar_visualizer.py @@ -0,0 +1,64 @@ +import argparse +import sys +from typing import Any, Callable, Iterator + +from pegen.build import build_parser +from pegen.grammar import Grammar, Rule + +argparser = argparse.ArgumentParser( + prog="pegen", description="Pretty print the AST for a given PEG grammar" +) +argparser.add_argument("filename", help="Grammar description") + + +class ASTGrammarPrinter: + def children(self, node: Rule) -> Iterator[Any]: + for value in node: + if isinstance(value, list): + yield from value + else: + yield value + + def name(self, node: Rule) -> str: + if not list(self.children(node)): + return repr(node) + return node.__class__.__name__ + + def print_grammar_ast(self, grammar: Grammar, printer: Callable[..., None] = print) -> None: + for rule in grammar.rules.values(): + printer(self.print_nodes_recursively(rule)) + + def print_nodes_recursively(self, node: Rule, prefix: str = "", istail: bool = True) -> str: + + children = list(self.children(node)) + value = self.name(node) + + line = prefix + ("└──" if istail else "├──") + value + "\n" + sufix = " " if istail else "│ " + + if not children: + return line + + *children, last = children + for child in children: + line += self.print_nodes_recursively(child, prefix + sufix, False) + line += self.print_nodes_recursively(last, prefix + sufix, True) + + return line + + +def main() -> None: + args = argparser.parse_args() + + try: + grammar, parser, tokenizer = build_parser(args.filename) + except Exception as err: + print("ERROR: Failed to parse grammar file", file=sys.stderr) + sys.exit(1) + + visitor = ASTGrammarPrinter() + visitor.print_grammar_ast(grammar) + + +if __name__ == "__main__": + main() diff --git a/scripts/pegen/javascript_generator.py b/scripts/pegen/javascript_generator.py new file mode 100644 index 000000000..7c453fe1d --- /dev/null +++ b/scripts/pegen/javascript_generator.py @@ -0,0 +1,977 @@ +import ast +import os.path +import re +from dataclasses import dataclass, field +from enum import Enum +from typing import IO, Any, Dict, List, Optional, Set, Text, Tuple + +from pegen import grammar +from pegen.grammar import ( + Alt, + Cut, + Forced, + Gather, + GrammarVisitor, + Group, + Leaf, + Lookahead, + NamedItem, + NameLeaf, + NegativeLookahead, + Opt, + PositiveLookahead, + Repeat0, + Repeat1, + Rhs, + Rule, + StringLeaf, +) +from pegen.parser_generator import ParserGenerator + +from adapt_grammar_actions import transform_action + +EXTENSION_PREFIX = r"""function fprintf(dest, format){ + var args = Array.from(arguments).slice(2) + for(var arg of args){ + format = format.replace(/%\*?[a-z]/, arg) + } + return format +} + +const stderr = null + +function D(x){ + console.log(x) +} + +function UNUSED(){ + // does nothing +} + +function strcmp(x, y){ + return x == y ? 0 : x < y ? 
-1 : 1 +} + +const MAXSTACK = 6000, + NULL = undefined + +const ENDMARKER = 0, + NAME = 1, + NUMBER = 2, + STRING = 3, + NEWLINE = 4, + INDENT = 5, + DEDENT = 6, + LPAR = 7, + RPAR = 8, + LSQB = 9, + RSQB = 10, + COLON = 11, + COMMA = 12, + SEMI = 13, + PLUS = 14, + MINUS = 15, + STAR = 16, + SLASH = 17, + VBAR = 18, + AMPER = 19, + LESS = 20, + GREATER = 21, + EQUAL = 22, + DOT = 23, + PERCENT = 24, + LBRACE = 25, + RBRACE = 26, + EQEQUAL = 27, + NOTEQUAL = 28, + LESSEQUAL = 29, + GREATEREQUAL = 30, + TILDE = 31, + CIRCUMFLEX = 32, + LEFTSHIFT = 33, + RIGHTSHIFT = 34, + DOUBLESTAR = 35, + PLUSEQUAL = 36, + MINEQUAL = 37, + STAREQUAL = 38, + SLASHEQUAL = 39, + PERCENTEQUAL = 40, + AMPEREQUAL = 41, + VBAREQUAL = 42, + CIRCUMFLEXEQUAL = 43, + LEFTSHIFTEQUAL = 44, + RIGHTSHIFTEQUAL = 45, + DOUBLESTAREQUAL = 46, + DOUBLESLASH = 47, + DOUBLESLASHEQUAL = 48, + AT = 49, + ATEQUAL = 50, + RARROW = 51, + ELLIPSIS = 52, + COLONEQUAL = 53, + EXCLAMATION = 54, + OP = 55, + AWAIT = 56, + ASYNC = 57, + TYPE_IGNORE = 58, + TYPE_COMMENT = 59, + SOFT_KEYWORD = 60, + FSTRING_START = 61, + FSTRING_MIDDLE = 62, + FSTRING_END = 63, + COMMENT = 64, + NL = 65, + ERRORTOKEN = 66, + N_TOKENS = 68 + + +function NEW_TYPE_COMMENT(){} + +const Store = new $B.ast.Store(), + Load = new $B.ast.Load + +const EXTRA = {} +""" + + +EXTENSION_SUFFIX = """ +$B._PyPegen_parse = function(p){ + console.log('parse', p) + // Initialize keywords + p.keywords = reserved_keywords; + p.n_keyword_lists = n_keyword_lists; + p.soft_keywords = soft_keywords; + + console.log('first token', p.tok.next().value) + + return file_rule(p) + +} +""" + + +class NodeTypes(Enum): + NAME_TOKEN = 0 + NUMBER_TOKEN = 1 + STRING_TOKEN = 2 + GENERIC_TOKEN = 3 + KEYWORD = 4 + SOFT_KEYWORD = 5 + CUT_OPERATOR = 6 + F_STRING_CHUNK = 7 + + +BASE_NODETYPES = { + "NAME": NodeTypes.NAME_TOKEN, + "NUMBER": NodeTypes.NUMBER_TOKEN, + "STRING": NodeTypes.STRING_TOKEN, + "SOFT_KEYWORD": NodeTypes.SOFT_KEYWORD, +} + + +@dataclass +class FunctionCall: + function: str + arguments: List[Any] = field(default_factory=list) + assigned_variable: Optional[str] = None + assigned_variable_type: Optional[str] = None + return_type: Optional[str] = None + nodetype: Optional[NodeTypes] = None + force_true: bool = False + comment: Optional[str] = None + + def __str__(self) -> str: + parts = [] + parts.append(self.function) + if self.arguments: + parts.append(f"({', '.join(map(str, self.arguments))})") + if self.force_true: + parts.append(", !p.error_indicator") + if self.assigned_variable: + if self.assigned_variable_type: + parts = [ + "(", + self.assigned_variable, + " = ", + # "(", + # self.assigned_variable_type, + # ")", + *parts, + ")", + ] + else: + parts = ["(", self.assigned_variable, " = ", *parts, ")"] + if '*' in "".join(parts): + print('function call', "".join(parts)) + input() + if self.comment: + parts.append(f" // {self.comment}") + return "".join(parts) + + +class CCallMakerVisitor(GrammarVisitor): + def __init__( + self, + parser_generator: ParserGenerator, + exact_tokens: Dict[str, int], + non_exact_tokens: Set[str], + ): + self.gen = parser_generator + self.exact_tokens = exact_tokens + self.non_exact_tokens = non_exact_tokens + self.cache: Dict[Any, FunctionCall] = {} + self.cleanup_statements: List[str] = [] + + def keyword_helper(self, keyword: str) -> FunctionCall: + return FunctionCall( + assigned_variable="_keyword", + function="$B._PyPegen.expect_token", + arguments=["p", self.gen.keywords[keyword]], + return_type="Token *", + nodetype=NodeTypes.KEYWORD, + 
comment=f"token='{keyword}'", + ) + + def soft_keyword_helper(self, value: str) -> FunctionCall: + return FunctionCall( + assigned_variable="_keyword", + function="$B._PyPegen.expect_soft_keyword", + arguments=["p", value], + return_type="expr_ty", + nodetype=NodeTypes.SOFT_KEYWORD, + comment=f"soft_keyword='{value}'", + ) + + def visit_NameLeaf(self, node: NameLeaf) -> FunctionCall: + name = node.value + if name in self.non_exact_tokens: + if name in BASE_NODETYPES: + return FunctionCall( + assigned_variable=f"{name.lower()}_var", + function=f"$B._PyPegen.{name.lower()}_token", + arguments=["p"], + nodetype=BASE_NODETYPES[name], + return_type="expr_ty", + comment=name, + ) + return FunctionCall( + assigned_variable=f"{name.lower()}_var", + function=f"$B._PyPegen.expect_token", + arguments=["p", name], + nodetype=NodeTypes.GENERIC_TOKEN, + return_type="Token *", + comment=f"token='{name}'", + ) + + type = None + rule = self.gen.all_rules.get(name.lower()) + if rule is not None: + type = "asdl_seq *" if rule.is_loop() or rule.is_gather() else rule.type + + return FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + return_type=type, + comment=f"{node}", + ) + + def visit_StringLeaf(self, node: StringLeaf) -> FunctionCall: + val = ast.literal_eval(node.value) + if re.match(r"[a-zA-Z_]\w*\Z", val): # This is a keyword + if node.value.endswith("'"): + return self.keyword_helper(val) + else: + return self.soft_keyword_helper(node.value) + else: + assert val in self.exact_tokens, f"{node.value} is not a known literal" + type = self.exact_tokens[val] + return FunctionCall( + assigned_variable="_literal", + function=f"$B._PyPegen.expect_token", + arguments=["p", type], + nodetype=NodeTypes.GENERIC_TOKEN, + return_type="Token *", + comment=f"token='{val}'", + ) + + def visit_Rhs(self, node: Rhs) -> FunctionCall: + if node in self.cache: + return self.cache[node] + if node.can_be_inlined: + self.cache[node] = self.generate_call(node.alts[0].items[0]) + else: + name = self.gen.artifical_rule_from_rhs(node) + self.cache[node] = FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + comment=f"{node}", + ) + return self.cache[node] + + def visit_NamedItem(self, node: NamedItem) -> FunctionCall: + call = self.generate_call(node.item) + if node.name: + call.assigned_variable = node.name + if node.type: + call.assigned_variable_type = node.type + return call + + def lookahead_call_helper(self, node: Lookahead, positive: int) -> FunctionCall: + call = self.generate_call(node.node) + if call.nodetype == NodeTypes.NAME_TOKEN: + return FunctionCall( + function=f"$B._PyPegen.lookahead_with_name", + arguments=[positive, call.function, *call.arguments], + return_type="int", + ) + elif call.nodetype == NodeTypes.SOFT_KEYWORD: + return FunctionCall( + function=f"$B._PyPegen.lookahead_with_string", + arguments=[positive, call.function, *call.arguments], + return_type="int", + ) + elif call.nodetype in {NodeTypes.GENERIC_TOKEN, NodeTypes.KEYWORD}: + return FunctionCall( + function=f"$B._PyPegen.lookahead_with_int", + arguments=[positive, call.function, *call.arguments], + return_type="int", + comment=f"token={node.node}", + ) + else: + return FunctionCall( + function=f"$B._PyPegen.lookahead", + arguments=[positive, call.function, *call.arguments], + return_type="int", + ) + + def visit_PositiveLookahead(self, node: PositiveLookahead) -> FunctionCall: + return self.lookahead_call_helper(node, 1) + + def visit_NegativeLookahead(self, 
node: NegativeLookahead) -> FunctionCall: + return self.lookahead_call_helper(node, 0) + + def visit_Forced(self, node: Forced) -> FunctionCall: + call = self.generate_call(node.node) + if isinstance(node.node, Leaf): + assert isinstance(node.node, Leaf) + val = ast.literal_eval(node.node.value) + assert val in self.exact_tokens, f"{node.node.value} is not a known literal" + type = self.exact_tokens[val] + return FunctionCall( + assigned_variable="_literal", + function=f"$B._PyPegen.expect_forced_token", + arguments=["p", type, f'"{val}"'], + nodetype=NodeTypes.GENERIC_TOKEN, + return_type="Token *", + comment=f"forced_token='{val}'", + ) + if isinstance(node.node, Group): + call = self.visit(node.node.rhs) + call.assigned_variable = None + call.comment = None + return FunctionCall( + assigned_variable="_literal", + function=f"$B._PyPegen.expect_forced_result", + arguments=["p", str(call), f'"{node.node.rhs!s}"'], + return_type="void *", + comment=f"forced_token=({node.node.rhs!s})", + ) + else: + raise NotImplementedError(f"Forced tokens don't work with {node.node} nodes") + + def visit_Opt(self, node: Opt) -> FunctionCall: + call = self.generate_call(node.node) + return FunctionCall( + assigned_variable="_opt_var", + function=call.function, + arguments=call.arguments, + force_true=True, + comment=f"{node}", + ) + + def visit_Repeat0(self, node: Repeat0) -> FunctionCall: + if node in self.cache: + return self.cache[node] + name = self.gen.artificial_rule_from_repeat(node.node, False) + self.cache[node] = FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + return_type="asdl_seq *", + comment=f"{node}", + ) + return self.cache[node] + + def visit_Repeat1(self, node: Repeat1) -> FunctionCall: + if node in self.cache: + return self.cache[node] + name = self.gen.artificial_rule_from_repeat(node.node, True) + self.cache[node] = FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + return_type="asdl_seq *", + comment=f"{node}", + ) + return self.cache[node] + + def visit_Gather(self, node: Gather) -> FunctionCall: + if node in self.cache: + return self.cache[node] + name = self.gen.artifical_rule_from_gather(node) + self.cache[node] = FunctionCall( + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + return_type="asdl_seq *", + comment=f"{node}", + ) + return self.cache[node] + + def visit_Group(self, node: Group) -> FunctionCall: + return self.generate_call(node.rhs) + + def visit_Cut(self, node: Cut) -> FunctionCall: + return FunctionCall( + assigned_variable="_cut_var", + return_type="int", + function="1", + nodetype=NodeTypes.CUT_OPERATOR, + ) + + def generate_call(self, node: Any) -> FunctionCall: + return super().visit(node) + + +class JavascriptParserGenerator(ParserGenerator, GrammarVisitor): + def __init__( + self, + grammar: grammar.Grammar, + tokens: Dict[int, str], + exact_tokens: Dict[str, int], + non_exact_tokens: Set[str], + file: Optional[IO[Text]], + debug: bool = False, + skip_actions: bool = False, + ): + super().__init__(grammar, set(tokens.values()), file) + self.callmakervisitor: CCallMakerVisitor = CCallMakerVisitor( + self, exact_tokens, non_exact_tokens + ) + self._varname_counter = 0 + self.debug = debug + self.skip_actions = skip_actions + self.cleanup_statements: List[str] = [] + + def add_level(self) -> None: + pass + + def remove_level(self) -> None: + pass + + def add_return(self, ret_val: str) -> None: + for stmt in self.cleanup_statements: + 
self.print(stmt) + self.remove_level() + self.print(f"return {ret_val};") + + def unique_varname(self, name: str = "tmpvar") -> str: + new_var = name + "_" + str(self._varname_counter) + self._varname_counter += 1 + return new_var + + def call_with_errorcheck_return(self, call_text: str, returnval: str) -> None: + error_var = self.unique_varname() + self.print(f"var {error_var} = {call_text};") + self.print(f"if ({error_var}) {{") + with self.indent(): + self.add_return(returnval) + self.print("}") + + def call_with_errorcheck_goto(self, call_text: str, goto_target: str) -> None: + error_var = self.unique_varname() + self.print(f"var {error_var} = {call_text};") + self.print(f"if ({error_var}) {{") + with self.indent(): + self.print(f"{goto_target}();") + self.print(f"}}") + + def out_of_memory_return( + self, + expr: str, + cleanup_code: Optional[str] = None, + ) -> None: + self.print(f"if ({expr}) {{") + with self.indent(): + if cleanup_code is not None: + self.print(cleanup_code) + self.print("p.error_indicator = 1;") + self.print("PyErr_NoMemory();") + self.add_return("NULL") + self.print(f"}}") + + def out_of_memory_goto(self, expr: str, goto_target: str) -> None: + self.print(f"if ({expr}) {{") + with self.indent(): + self.print("PyErr_NoMemory();") + self.print(f"{goto_target}();") + self.print(f"}}") + + def generate(self, filename: str) -> None: + self.collect_rules() + basename = os.path.basename(filename) + self.print(f"// @generated by pegen from {basename}") + header = self.grammar.metas.get("header", EXTENSION_PREFIX) + if header: + self.print(header.rstrip("\n")) + subheader = self.grammar.metas.get("subheader", "") + if subheader: + self.print(subheader) + self._setup_keywords() + self._setup_soft_keywords() + + types = [] + for i, (rulename, rule) in enumerate(self.all_rules.items(), 1000): + types.append(f"{rulename}_type = {i}") + types = 'const ' + ', \n'.join(types) + self.print(types) + self.print() + """ + for rulename, rule in self.all_rules.items(): + if rule.is_loop() or rule.is_gather(): + type = "asdl_seq *" + elif rule.type: + type = rule.type + " " + else: + type = "void *" + self.print(f"static {type}{rulename}_rule(p);") + self.print() + """ + for rulename, rule in list(self.all_rules.items()): + self.print() + if rule.left_recursive: + self.print("// Left-recursive") + self.visit(rule) + if self.skip_actions: + mode = 0 + else: + mode = int(self.rules["start"].type == "mod_ty") if "start" in self.rules else 1 + if mode == 1 and self.grammar.metas.get("bytecode"): + mode += 1 + modulename = self.grammar.metas.get("modulename", "parse") + trailer = EXTENSION_SUFFIX + if trailer: + self.print(trailer.rstrip("\n") % dict(mode=mode, modulename=modulename)) + + def _group_keywords_by_length(self) -> Dict[int, List[Tuple[str, int]]]: + groups: Dict[int, List[Tuple[str, int]]] = {} + for keyword_str, keyword_type in self.keywords.items(): + length = len(keyword_str) + if length in groups: + groups[length].append((keyword_str, keyword_type)) + else: + groups[length] = [(keyword_str, keyword_type)] + return groups + + def _setup_keywords(self) -> None: + n_keyword_lists = ( + len(max(self.keywords.keys(), key=len)) + 1 if len(self.keywords) > 0 else 0 + ) + self.print(f"const n_keyword_lists = {n_keyword_lists};") + groups = self._group_keywords_by_length() + self.print("const reserved_keywords = {") + with self.indent(): + num_groups = max(groups) + 1 if groups else 1 + for keywords_length in range(num_groups): + if keywords_length not in groups.keys(): + 
self.print("NULL: -1,") + else: + # self.print("(KeywordToken[]) {") + # with self.indent(): + for keyword_str, keyword_type in groups[keywords_length]: + self.print(f'{keyword_str}: {keyword_type},') + # self.print("{NULL, -1},") + # self.print("},") + self.print("};") + + def _setup_soft_keywords(self) -> None: + soft_keywords = sorted(self.soft_keywords) + self.print("const soft_keywords = [") + with self.indent(): + for keyword in soft_keywords: + self.print(f'"{keyword}",') + self.print("NULL,") + self.print("];") + + def _set_up_token_start_metadata_extraction(self) -> None: + self.print("if (p.mark == p.fill && $B._PyPegen.fill_token(p) < 0) {") + with self.indent(): + self.print("p.error_indicator = 1;") + self.add_return("NULL") + self.print("}") + self.print("var _start_lineno = p.tokens[_mark].lineno;") + self.print("UNUSED(_start_lineno); // Only used by EXTRA macro") + self.print("var _start_col_offset = p.tokens[_mark].col_offset;") + self.print("UNUSED(_start_col_offset); // Only used by EXTRA macro") + + def _set_up_token_end_metadata_extraction(self) -> None: + self.print("var _token = $B._PyPegen.get_last_nonnwhitespace_token(p);") + self.print("if (_token == NULL) {") + with self.indent(): + self.add_return("NULL") + self.print("}") + self.print("var _end_lineno = _token.end_lineno;") + self.print("UNUSED(_end_lineno); // Only used by EXTRA macro") + self.print("var _end_col_offset = _token.end_col_offset;") + self.print("UNUSED(_end_col_offset); // Only used by EXTRA macro") + + def _check_for_errors(self) -> None: + self.print("if (p.error_indicator) {") + with self.indent(): + self.add_return("NULL") + self.print("}") + + def _set_up_rule_memoization(self, node: Rule, result_type: str) -> None: + self.print("{") + with self.indent(): + self.add_level() + self.print(f"var _res = {{value: NULL}};") + self.print(f"if ($B._PyPegen.is_memoized(p, {node.name}_type, _res)) {{") + with self.indent(): + self.add_return("_res.value") + self.print("}") + self.print("_res = NULL;") + self.print("var _mark = p.mark;") + self.print("var _resmark = p.mark;") + self.print("while (1) {") + with self.indent(): + self.call_with_errorcheck_return( + f"$B._PyPegen.update_memo(p, _mark, {node.name}_type, _res)", "_res" + ) + self.print("p.mark = _mark;") + self.print(f"var _raw = {node.name}_raw(p);") + self.print("if (p.error_indicator) {") + with self.indent(): + self.add_return("NULL") + self.print("}") + self.print("if (_raw == NULL || p.mark <= _resmark)") + with self.indent(): + self.print("break;") + self.print(f"_resmark = p.mark;") + self.print("_res = _raw;") + self.print("}") + self.print(f"p.mark = _resmark;") + self.add_return("_res") + self.print("}") + # self.print(f"static {result_type}") + self.print(f"function {node.name}_raw(p)") + + def _should_memoize(self, node: Rule) -> bool: + return node.memo and not node.left_recursive + + def _handle_default_rule_body(self, node: Rule, rhs: Rhs, result_type: str) -> None: + memoize = self._should_memoize(node) + + with self.indent(): + self.add_level() + self._check_for_errors() + self.print(f"var _res = {{value: NULL}};") + if memoize: + self.print(f"if ($B._PyPegen.is_memoized(p, {node.name}_type, _res)) {{") + with self.indent(): + self.add_return("_res.value") + self.print("}") + self.print("_res = NULL;") + self.print("var _mark = p.mark;") + if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts): + self._set_up_token_start_metadata_extraction() + self.visit( + rhs, + is_loop=False, + is_gather=node.is_gather(), + 
rulename=node.name, + ) + if self.debug: + self.print(f'D(fprintf(stderr, "Fail at %d: {node.name}\\n", p.mark));') + self.print("_res = NULL;") + self.print(" function done(){") + with self.indent(): + if memoize: + self.print(f"$B._PyPegen.insert_memo(p, _mark, {node.name}_type, _res);") + self.add_return("_res") + self.print('}') + + def _handle_loop_rule_body(self, node: Rule, rhs: Rhs) -> None: + memoize = self._should_memoize(node) + is_repeat1 = node.name.startswith("_loop1") + + with self.indent(): + self.add_level() + self._check_for_errors() + self.print(f"var _res = {{value: NULL}};") + if memoize: + self.print(f"if ($B._PyPegen.is_memoized(p, {node.name}_type, _res)) {{") + with self.indent(): + self.add_return("_res.value") + self.print("}") + self.print("_res = NULL;") + self.print("var _mark = p.mark;") + if memoize: + self.print("var _start_mark = p.mark;") + self.print("var _children = [];") + self.print("var _children_capacity = 1;") + self.print("var _n = 0;") + if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts): + self._set_up_token_start_metadata_extraction() + self.visit( + rhs, + is_loop=True, + is_gather=node.is_gather(), + rulename=node.name, + ) + if is_repeat1: + self.print("if (_n == 0 || p.error_indicator) {") + with self.indent(): + self.print("// PyMem_Free(_children);") + self.add_return("NULL") + self.print("}") + self.print("var _seq = [];") + self.out_of_memory_return(f"!_seq", cleanup_code="PyMem_Free(_children);") + self.print("for (let i = 0; i < _n; i++){_seq[i] = _children[i]};") + self.print("// PyMem_Free(_children);") + if memoize and node.name: + self.print(f"$B._PyPegen.insert_memo(p, _start_mark, {node.name}_type, _seq);") + self.add_return("_seq") + + def visit_Rule(self, node: Rule) -> None: + is_loop = node.is_loop() + is_gather = node.is_gather() + rhs = node.flatten() + if is_loop or is_gather: + result_type = "asdl_seq *" + elif node.type: + result_type = node.type + else: + result_type = "void *" + + for line in str(node).splitlines(): + self.print(f"// {line}") + if node.left_recursive and node.leader: + self.print(f"function {node.name}_raw(){{}};") + + # self.print(f"static {result_type}") + self.print(f"function {node.name}_rule(p)") + + if node.left_recursive and node.leader: + self._set_up_rule_memoization(node, result_type) + + self.print("{") + + if node.name.endswith("without_invalid"): + with self.indent(): + self.print("var _prev_call_invalid = p.call_invalid_rules;") + self.print("p.call_invalid_rules = 0;") + self.cleanup_statements.append("p.call_invalid_rules = _prev_call_invalid;") + + if is_loop: + self._handle_loop_rule_body(node, rhs) + else: + self._handle_default_rule_body(node, rhs, result_type) + + if node.name.endswith("without_invalid"): + self.cleanup_statements.pop() + + self.print("}") + + def visit_NamedItem(self, node: NamedItem) -> None: + call = self.callmakervisitor.generate_call(node) + if call.assigned_variable: + call.assigned_variable = self.dedupe(call.assigned_variable) + self.print(call) + + def visit_Rhs( + self, node: Rhs, is_loop: bool, is_gather: bool, rulename: Optional[str] + ) -> None: + if is_loop: + assert len(node.alts) == 1 + for alt in node.alts: + self.visit(alt, is_loop=is_loop, is_gather=is_gather, rulename=rulename) + + def join_conditions(self, keyword: str, node: Any) -> None: + self.print(f"{keyword} (") + with self.indent(): + first = True + for item in node.items: + if first: + first = False + else: + self.print("&&") + self.visit(item) + self.print(")") + + def 
emit_action(self, node: Alt, cleanup_code: Optional[str] = None) -> None: + _action = transform_action(node.action) + self.print(f"_res = {_action};") + + self.print("if (_res == NULL && PyErr_Occurred()) {") + with self.indent(): + self.print("p.error_indicator = 1;") + if cleanup_code: + self.print(cleanup_code) + self.add_return("NULL") + self.print("}") + + if self.debug: + node = str(node).replace('"', '\\"') + self.print( + f'D(fprintf(stderr, "Hit with action [%d-%d]: %s\\n", _mark, p.mark, "{node}"));' + ) + + def emit_default_action(self, is_gather: bool, node: Alt) -> None: + node = str(node).replace('"', '\\"') + if len(self.local_variable_names) > 1: + if is_gather: + assert len(self.local_variable_names) == 2 + self.print( + f"_res = $B._PyPegen.seq_insert_in_front(p, " + f"{self.local_variable_names[0]}, {self.local_variable_names[1]});" + ) + else: + if self.debug: + self.print( + f'D(fprintf(stderr, "Hit without action [%d:%d]: %s\\n", _mark, p.mark, "{node}"));' + ) + self.print( + f"_res = $B._PyPegen.dummy_name(p, {', '.join(self.local_variable_names)});" + ) + else: + if self.debug: + self.print( + f'D(fprintf(stderr, "Hit with default action [%d:%d]: %s\\n", _mark, p.mark, "{node}"));' + ) + self.print(f"_res = {self.local_variable_names[0]};") + + def emit_dummy_action(self) -> None: + self.print("_res = $B._PyPegen.dummy_name(p);") + + def handle_alt_normal(self, node: Alt, is_gather: bool, rulename: Optional[str]) -> None: + self.join_conditions(keyword="if", node=node) + self.print("{") + # We have parsed successfully all the conditions for the option. + with self.indent(): + node_str = str(node).replace('"', '\\"') + if self.debug: + self.print( + f'D(fprintf(stderr, "%*c+ {rulename}[%d-%d]: %s succeeded!\\n", p.level, \' \', _mark, p.mark, "{node_str}"));' + ) + # Prepare to emit the rule action and do so + if node.action and "EXTRA" in node.action: + self._set_up_token_end_metadata_extraction() + if self.skip_actions: + self.emit_dummy_action() + elif node.action: + self.emit_action(node) + else: + self.emit_default_action(is_gather, node) + + # As the current option has parsed correctly, do not continue with the rest. + self.print(f"return done();") + self.print("}") + + def handle_alt_loop(self, node: Alt, is_gather: bool, rulename: Optional[str]) -> None: + # Condition of the main body of the alternative + self.join_conditions(keyword="while", node=node) + self.print("{") + # We have parsed successfully one item! + with self.indent(): + # Prepare to emit the rule action and do so + if node.action and "EXTRA" in node.action: + self._set_up_token_end_metadata_extraction() + if self.skip_actions: + self.emit_dummy_action() + elif node.action: + self.emit_action(node, cleanup_code="PyMem_Free(_children);") + else: + self.emit_default_action(is_gather, node) + + # Add the result of rule to the temporary buffer of children. This buffer + # will populate later an asdl_seq with all elements to return. 
+ """ + self.print("if (_n == _children_capacity) {") + with self.indent(): + self.print("_children_capacity *= 2;") + self.print( + "var _new_children = PyMem_Realloc(_children, _children_capacity * sizeof(void *));" + ) + self.out_of_memory_return(f"!_new_children", cleanup_code="PyMem_Free(_children);") + self.print("_children = _new_children;") + self.print("}") + """ + self.print("_children[_n++] = _res;") + self.print("_mark = p.mark;") + self.print("}") + + def visit_Alt( + self, node: Alt, is_loop: bool, is_gather: bool, rulename: Optional[str] + ) -> None: + if len(node.items) == 1 and str(node.items[0]).startswith("invalid_"): + self.print(f"if (p.call_invalid_rules) {{ // {node}") + else: + self.print(f"{{ // {node}") + with self.indent(): + self._check_for_errors() + node_str = str(node).replace('"', '\\"') + if self.debug: + self.print( + f'D(fprintf(stderr, "%*c> {rulename}[%d-%d]: %s\\n", p.level, \' \', _mark, p.mark, "{node_str}"));' + ) + # Prepare variable declarations for the alternative + vars = self.collect_vars(node) + for v, var_type in sorted(item for item in vars.items() if item[0] is not None): + if not var_type: + var_type = "" + else: + var_type += " " + if v == "_cut_var": + v += " = 0" # cut_var must be initialized + self.print(f"var {v};") + if v and v.startswith("_opt_var"): + self.print(f"UNUSED({v}); // Silence compiler warnings") + + with self.local_variable_context(): + if is_loop: + self.handle_alt_loop(node, is_gather, rulename) + else: + self.handle_alt_normal(node, is_gather, rulename) + + self.print("p.mark = _mark;") + node_str = str(node).replace('"', '\\"') + if self.debug: + self.print( + f"D(fprintf(stderr, \"%*c%s {rulename}[%d-%d]: %s failed!\\n\", p.level, ' ',\n" + f' p.error_indicator ? "ERROR!" : "-", _mark, p.mark, "{node_str}"));' + ) + if "_cut_var" in vars: + self.print("if (_cut_var) {") + with self.indent(): + self.add_return("NULL") + self.print("}") + self.print("}") + + def collect_vars(self, node: Alt) -> Dict[Optional[str], Optional[str]]: + types = {} + with self.local_variable_context(): + for item in node.items: + name, type = self.add_var(item) + types[name] = type + return types + + def add_var(self, node: NamedItem) -> Tuple[Optional[str], Optional[str]]: + call = self.callmakervisitor.generate_call(node.item) + name = node.name if node.name else call.assigned_variable + if name is not None: + name = self.dedupe(name) + return_type = call.return_type if node.type is None else node.type + return name, return_type diff --git a/scripts/pegen/keywordgen.py b/scripts/pegen/keywordgen.py new file mode 100644 index 000000000..35a5e1a22 --- /dev/null +++ b/scripts/pegen/keywordgen.py @@ -0,0 +1,76 @@ +"""Generate Lib/keyword.py from the Grammar and Tokens files using pgen""" + +import argparse + +from .build import build_parser, generate_token_definitions +from .c_generator import CParserGenerator + +TEMPLATE = r''' +"""Keywords (from "Grammar/python.gram") + +This file is automatically generated; please don't muck it up! + +To update the symbols in this file, 'cd' to the top directory of +the python source tree and run: + + PYTHONPATH=Tools/peg_generator python3 -m pegen.keywordgen \ + Grammar/python.gram \ + Grammar/Tokens \ + Lib/keyword.py + +Alternatively, you can run 'make regen-keyword'. 
+""" + +__all__ = ["iskeyword", "issoftkeyword", "kwlist", "softkwlist"] + +kwlist = [ +{keywords} +] + +softkwlist = [ +{soft_keywords} +] + +iskeyword = frozenset(kwlist).__contains__ +issoftkeyword = frozenset(softkwlist).__contains__ +'''.lstrip() + +EXTRA_KEYWORDS = ["async", "await"] + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Generate the Lib/keywords.py file from the grammar." + ) + parser.add_argument( + "grammar", type=str, help="The file with the grammar definition in PEG format" + ) + parser.add_argument( + "tokens_file", type=argparse.FileType("r"), help="The file with the token definitions" + ) + parser.add_argument( + "keyword_file", + type=argparse.FileType("w"), + help="The path to write the keyword definitions", + ) + args = parser.parse_args() + + grammar, _, _ = build_parser(args.grammar) + with args.tokens_file as tok_file: + all_tokens, exact_tok, non_exact_tok = generate_token_definitions(tok_file) + gen = CParserGenerator(grammar, all_tokens, exact_tok, non_exact_tok, file=None) + gen.collect_rules() + + with args.keyword_file as thefile: + all_keywords = sorted(list(gen.keywords.keys()) + EXTRA_KEYWORDS) + all_soft_keywords = sorted(gen.soft_keywords) + + keywords = "" if not all_keywords else " " + ",\n ".join(map(repr, all_keywords)) + soft_keywords = ( + "" if not all_soft_keywords else " " + ",\n ".join(map(repr, all_soft_keywords)) + ) + thefile.write(TEMPLATE.format(keywords=keywords, soft_keywords=soft_keywords)) + + +if __name__ == "__main__": + main() diff --git a/scripts/pegen/metagrammar.gram b/scripts/pegen/metagrammar.gram new file mode 100644 index 000000000..f22c334ca --- /dev/null +++ b/scripts/pegen/metagrammar.gram @@ -0,0 +1,131 @@ +@subheader """\ +from ast import literal_eval + +from pegen.grammar import ( + Alt, + Cut, + Forced, + Gather, + Group, + Item, + Lookahead, + LookaheadOrCut, + MetaTuple, + MetaList, + NameLeaf, + NamedItem, + NamedItemList, + NegativeLookahead, + Opt, + Plain, + PositiveLookahead, + Repeat0, + Repeat1, + Rhs, + Rule, + RuleList, + RuleName, + Grammar, + StringLeaf, +) +""" + +start[Grammar]: grammar ENDMARKER { grammar } + +grammar[Grammar]: + | metas rules { Grammar(rules, metas) } + | rules { Grammar(rules, []) } + +metas[MetaList]: + | meta metas { [meta] + metas } + | meta { [meta] } + +meta[MetaTuple]: + | "@" NAME NEWLINE { (name.string, None) } + | "@" a=NAME b=NAME NEWLINE { (a.string, b.string) } + | "@" NAME STRING NEWLINE { (name.string, literal_eval(string.string)) } + +rules[RuleList]: + | rule rules { [rule] + rules } + | rule { [rule] } + +rule[Rule]: + | rulename memoflag? ":" alts NEWLINE INDENT more_alts DEDENT { + Rule(rulename[0], rulename[1], Rhs(alts.alts + more_alts.alts), memo=opt) } + | rulename memoflag? ":" NEWLINE INDENT more_alts DEDENT { + Rule(rulename[0], rulename[1], more_alts, memo=opt) } + | rulename memoflag? 
":" alts NEWLINE { Rule(rulename[0], rulename[1], alts, memo=opt) } + +rulename[RuleName]: + | NAME annotation { (name.string, annotation) } + | NAME { (name.string, None) } + +# In the future this may return something more complicated +memoflag[str]: + | '(' "memo" ')' { "memo" } + +alts[Rhs]: + | alt "|" alts { Rhs([alt] + alts.alts)} + | alt { Rhs([alt]) } + +more_alts[Rhs]: + | "|" alts NEWLINE more_alts { Rhs(alts.alts + more_alts.alts) } + | "|" alts NEWLINE { Rhs(alts.alts) } + +alt[Alt]: + | items '$' action { Alt(items + [NamedItem(None, NameLeaf('ENDMARKER'))], action=action) } + | items '$' { Alt(items + [NamedItem(None, NameLeaf('ENDMARKER'))], action=None) } + | items action { Alt(items, action=action) } + | items { Alt(items, action=None) } + +items[NamedItemList]: + | named_item items { [named_item] + items } + | named_item { [named_item] } + +named_item[NamedItem]: + | NAME annotation '=' ~ item {NamedItem(name.string, item, annotation)} + | NAME '=' ~ item {NamedItem(name.string, item)} + | item {NamedItem(None, item)} + | forced=forced_atom {NamedItem(None, forced)} + | it=lookahead {NamedItem(None, it)} + +forced_atom[Forced]: + | '&''&' ~ atom {Forced(atom)} + +lookahead[LookaheadOrCut]: + | '&' ~ atom {PositiveLookahead(atom)} + | '!' ~ atom {NegativeLookahead(atom)} + | '~' {Cut()} + +item[Item]: + | '[' ~ alts ']' {Opt(alts)} + | atom '?' {Opt(atom)} + | atom '*' {Repeat0(atom)} + | atom '+' {Repeat1(atom)} + | sep=atom '.' node=atom '+' {Gather(sep, node)} + | atom {atom} + +atom[Plain]: + | '(' ~ alts ')' {Group(alts)} + | NAME {NameLeaf(name.string) } + | STRING {StringLeaf(string.string)} + +# Mini-grammar for the actions and annotations + +action[str]: "{" ~ target_atoms "}" { target_atoms } +annotation[str]: "[" ~ target_atoms "]" { target_atoms } + +target_atoms[str]: + | target_atom target_atoms { target_atom + " " + target_atoms } + | target_atom { target_atom } + +target_atom[str]: + | "{" ~ atoms=target_atoms? "}" { "{" + (atoms or "") + "}" } + | "[" ~ atoms=target_atoms? "]" { "[" + (atoms or "") + "]" } + | NAME "*" { name.string + "*" } + | NAME { name.string } + | NUMBER { number.string } + | STRING { string.string } + | "?" { "?" } + | ":" { ":" } + | !"}" !"]" OP { op.string } diff --git a/scripts/pegen/parser.py b/scripts/pegen/parser.py new file mode 100644 index 000000000..034e8e601 --- /dev/null +++ b/scripts/pegen/parser.py @@ -0,0 +1,334 @@ +import argparse +import sys +import time +import token +import tokenize +import traceback +from abc import abstractmethod +from typing import Any, Callable, ClassVar, Dict, Optional, Tuple, Type, TypeVar, cast + +from pegen.tokenizer import Mark, Tokenizer, exact_token_types + +T = TypeVar("T") +P = TypeVar("P", bound="Parser") +F = TypeVar("F", bound=Callable[..., Any]) + + +def logger(method: F) -> F: + """For non-memoized functions that we want to be logged. + + (In practice this is only non-leader left-recursive functions.) + """ + method_name = method.__name__ + + def logger_wrapper(self: P, *args: object) -> T: + if not self._verbose: + return method(self, *args) + argsr = ",".join(repr(arg) for arg in args) + fill = " " * self._level + print(f"{fill}{method_name}({argsr}) .... (looking at {self.showpeek()})") + self._level += 1 + tree = method(self, *args) + self._level -= 1 + print(f"{fill}... 
{method_name}({argsr}) --> {tree!s:.200}") + return tree + + logger_wrapper.__wrapped__ = method # type: ignore + return cast(F, logger_wrapper) + + +def memoize(method: F) -> F: + """Memoize a symbol method.""" + method_name = method.__name__ + + def memoize_wrapper(self: P, *args: object) -> T: + mark = self._mark() + key = mark, method_name, args + # Fast path: cache hit, and not verbose. + if key in self._cache and not self._verbose: + tree, endmark = self._cache[key] + self._reset(endmark) + return tree + # Slow path: no cache hit, or verbose. + verbose = self._verbose + argsr = ",".join(repr(arg) for arg in args) + fill = " " * self._level + if key not in self._cache: + if verbose: + print(f"{fill}{method_name}({argsr}) ... (looking at {self.showpeek()})") + self._level += 1 + tree = method(self, *args) + self._level -= 1 + if verbose: + print(f"{fill}... {method_name}({argsr}) -> {tree!s:.200}") + endmark = self._mark() + self._cache[key] = tree, endmark + else: + tree, endmark = self._cache[key] + if verbose: + print(f"{fill}{method_name}({argsr}) -> {tree!s:.200}") + self._reset(endmark) + return tree + + memoize_wrapper.__wrapped__ = method # type: ignore + return cast(F, memoize_wrapper) + + +def memoize_left_rec(method: Callable[[P], Optional[T]]) -> Callable[[P], Optional[T]]: + """Memoize a left-recursive symbol method.""" + method_name = method.__name__ + + def memoize_left_rec_wrapper(self: P) -> Optional[T]: + mark = self._mark() + key = mark, method_name, () + # Fast path: cache hit, and not verbose. + if key in self._cache and not self._verbose: + tree, endmark = self._cache[key] + self._reset(endmark) + return tree + # Slow path: no cache hit, or verbose. + verbose = self._verbose + fill = " " * self._level + if key not in self._cache: + if verbose: + print(f"{fill}{method_name} ... (looking at {self.showpeek()})") + self._level += 1 + + # For left-recursive rules we manipulate the cache and + # loop until the rule shows no progress, then pick the + # previous result. For an explanation why this works, see + # https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion + # (But we use the memoization cache instead of a static + # variable; perhaps this is similar to a paper by Warth et al. + # (http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08). + + # Prime the cache with a failure. 
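+            # Rough illustration (rule names hypothetical): for a left-recursive
+            # rule like  expr: expr '+' term | term  the first pass through the
+            # loop below can only match the plain 'term' alternative, because the
+            # recursive call hits this primed failure; that result is cached, so
+            # the next pass can grow it into  expr '+' term, and so on until an
+            # iteration no longer advances the end mark.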
+ self._cache[key] = None, mark + lastresult, lastmark = None, mark + depth = 0 + if verbose: + print(f"{fill}Recursive {method_name} at {mark} depth {depth}") + + while True: + self._reset(mark) + self.in_recursive_rule += 1 + try: + result = method(self) + finally: + self.in_recursive_rule -= 1 + endmark = self._mark() + depth += 1 + if verbose: + print( + f"{fill}Recursive {method_name} at {mark} depth {depth}: {result!s:.200} to {endmark}" + ) + if not result: + if verbose: + print(f"{fill}Fail with {lastresult!s:.200} to {lastmark}") + break + if endmark <= lastmark: + if verbose: + print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}") + break + self._cache[key] = lastresult, lastmark = result, endmark + + self._reset(lastmark) + tree = lastresult + + self._level -= 1 + if verbose: + print(f"{fill}{method_name}() -> {tree!s:.200} [cached]") + if tree: + endmark = self._mark() + else: + endmark = mark + self._reset(endmark) + self._cache[key] = tree, endmark + else: + tree, endmark = self._cache[key] + if verbose: + print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]") + if tree: + self._reset(endmark) + return tree + + memoize_left_rec_wrapper.__wrapped__ = method # type: ignore + return memoize_left_rec_wrapper + + +class Parser: + """Parsing base class.""" + + KEYWORDS: ClassVar[Tuple[str, ...]] + + SOFT_KEYWORDS: ClassVar[Tuple[str, ...]] + + def __init__(self, tokenizer: Tokenizer, *, verbose: bool = False): + self._tokenizer = tokenizer + self._verbose = verbose + self._level = 0 + self._cache: Dict[Tuple[Mark, str, Tuple[Any, ...]], Tuple[Any, Mark]] = {} + # Integer tracking whether we are in a left recursive rule or not. Can be useful + # for error reporting. + self.in_recursive_rule = 0 + # Pass through common tokenizer methods. 
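+        # Generated rule methods backtrack through these two aliases; a rough
+        # sketch (rule and variable names hypothetical):
+        #     mark = self._mark()
+        #     if (a := self.name()) and self.expect('='):
+        #         return a
+        #     self._reset(mark)
+        #     return None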
+ self._mark = self._tokenizer.mark + self._reset = self._tokenizer.reset + + @abstractmethod + def start(self) -> Any: + pass + + def showpeek(self) -> str: + tok = self._tokenizer.peek() + return f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}" + + @memoize + def name(self) -> Optional[tokenize.TokenInfo]: + tok = self._tokenizer.peek() + if tok.type == token.NAME and tok.string not in self.KEYWORDS: + return self._tokenizer.getnext() + return None + + @memoize + def number(self) -> Optional[tokenize.TokenInfo]: + tok = self._tokenizer.peek() + if tok.type == token.NUMBER: + return self._tokenizer.getnext() + return None + + @memoize + def string(self) -> Optional[tokenize.TokenInfo]: + tok = self._tokenizer.peek() + if tok.type == token.STRING: + return self._tokenizer.getnext() + return None + + @memoize + def op(self) -> Optional[tokenize.TokenInfo]: + tok = self._tokenizer.peek() + if tok.type == token.OP: + return self._tokenizer.getnext() + return None + + @memoize + def type_comment(self) -> Optional[tokenize.TokenInfo]: + tok = self._tokenizer.peek() + if tok.type == token.TYPE_COMMENT: + return self._tokenizer.getnext() + return None + + @memoize + def soft_keyword(self) -> Optional[tokenize.TokenInfo]: + tok = self._tokenizer.peek() + if tok.type == token.NAME and tok.string in self.SOFT_KEYWORDS: + return self._tokenizer.getnext() + return None + + @memoize + def expect(self, type: str) -> Optional[tokenize.TokenInfo]: + tok = self._tokenizer.peek() + if tok.string == type: + return self._tokenizer.getnext() + if type in exact_token_types: + if tok.type == exact_token_types[type]: + return self._tokenizer.getnext() + if type in token.__dict__: + if tok.type == token.__dict__[type]: + return self._tokenizer.getnext() + if tok.type == token.OP and tok.string == type: + return self._tokenizer.getnext() + return None + + def expect_forced(self, res: Any, expectation: str) -> Optional[tokenize.TokenInfo]: + if res is None: + raise self.make_syntax_error(f"expected {expectation}") + return res + + def positive_lookahead(self, func: Callable[..., T], *args: object) -> T: + mark = self._mark() + ok = func(*args) + self._reset(mark) + return ok + + def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool: + mark = self._mark() + ok = func(*args) + self._reset(mark) + return not ok + + def make_syntax_error(self, message: str, filename: str = "") -> SyntaxError: + tok = self._tokenizer.diagnose() + return SyntaxError(message, (filename, tok.start[0], 1 + tok.start[1], tok.line)) + + +def simple_parser_main(parser_class: Type[Parser]) -> None: + argparser = argparse.ArgumentParser() + argparser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="Print timing stats; repeat for more debug output", + ) + argparser.add_argument( + "-q", "--quiet", action="store_true", help="Don't print the parsed program" + ) + argparser.add_argument("filename", help="Input file ('-' to use stdin)") + + args = argparser.parse_args() + verbose = args.verbose + verbose_tokenizer = verbose >= 3 + verbose_parser = verbose == 2 or verbose >= 4 + + t0 = time.time() + + filename = args.filename + if filename == "" or filename == "-": + filename = "" + file = sys.stdin + else: + file = open(args.filename) + try: + tokengen = tokenize.generate_tokens(file.readline) + tokenizer = Tokenizer(tokengen, verbose=verbose_tokenizer) + parser = parser_class(tokenizer, verbose=verbose_parser) + tree = parser.start() + try: + if file.isatty(): + endpos = 
0 + else: + endpos = file.tell() + except IOError: + endpos = 0 + finally: + if file is not sys.stdin: + file.close() + + t1 = time.time() + + if not tree: + err = parser.make_syntax_error(filename) + traceback.print_exception(err.__class__, err, None) + sys.exit(1) + + if not args.quiet: + print(tree) + + if verbose: + dt = t1 - t0 + diag = tokenizer.diagnose() + nlines = diag.end[0] + if diag.type == token.ENDMARKER: + nlines -= 1 + print(f"Total time: {dt:.3f} sec; {nlines} lines", end="") + if endpos: + print(f" ({endpos} bytes)", end="") + if dt: + print(f"; {nlines / dt:.0f} lines/sec") + else: + print() + print("Caches sizes:") + print(f" token array : {len(tokenizer._tokens):10}") + print(f" cache : {len(parser._cache):10}") + ## print_memstats() diff --git a/scripts/pegen/parser_generator.py b/scripts/pegen/parser_generator.py new file mode 100644 index 000000000..f2105d8fa --- /dev/null +++ b/scripts/pegen/parser_generator.py @@ -0,0 +1,383 @@ +import ast +import contextlib +import re +from abc import abstractmethod +from typing import ( + IO, + AbstractSet, + Any, + Dict, + Iterable, + Iterator, + List, + Optional, + Set, + Text, + Tuple, + Union, +) + +from pegen import sccutils +from pegen.grammar import ( + Alt, + Cut, + Forced, + Gather, + Grammar, + GrammarError, + GrammarVisitor, + Group, + Lookahead, + NamedItem, + NameLeaf, + Opt, + Plain, + Repeat0, + Repeat1, + Rhs, + Rule, + StringLeaf, +) + + +class RuleCollectorVisitor(GrammarVisitor): + """Visitor that invokes a provieded callmaker visitor with just the NamedItem nodes""" + + def __init__(self, rules: Dict[str, Rule], callmakervisitor: GrammarVisitor) -> None: + self.rulses = rules + self.callmaker = callmakervisitor + + def visit_Rule(self, rule: Rule) -> None: + self.visit(rule.flatten()) + + def visit_NamedItem(self, item: NamedItem) -> None: + self.callmaker.visit(item) + + +class KeywordCollectorVisitor(GrammarVisitor): + """Visitor that collects all the keywods and soft keywords in the Grammar""" + + def __init__(self, gen: "ParserGenerator", keywords: Dict[str, int], soft_keywords: Set[str]): + self.generator = gen + self.keywords = keywords + self.soft_keywords = soft_keywords + + def visit_StringLeaf(self, node: StringLeaf) -> None: + val = ast.literal_eval(node.value) + if re.match(r"[a-zA-Z_]\w*\Z", val): # This is a keyword + if node.value.endswith("'") and node.value not in self.keywords: + self.keywords[val] = self.generator.keyword_type() + else: + return self.soft_keywords.add(node.value.replace('"', "")) + + +class RuleCheckingVisitor(GrammarVisitor): + def __init__(self, rules: Dict[str, Rule], tokens: Set[str]): + self.rules = rules + self.tokens = tokens + + def visit_NameLeaf(self, node: NameLeaf) -> None: + if node.value not in self.rules and node.value not in self.tokens: + raise GrammarError(f"Dangling reference to rule {node.value!r}") + + def visit_NamedItem(self, node: NamedItem) -> None: + if node.name and node.name.startswith("_"): + raise GrammarError(f"Variable names cannot start with underscore: '{node.name}'") + self.visit(node.item) + + +class ParserGenerator: + + callmakervisitor: GrammarVisitor + + def __init__(self, grammar: Grammar, tokens: Set[str], file: Optional[IO[Text]]): + self.grammar = grammar + self.tokens = tokens + self.keywords: Dict[str, int] = {} + self.soft_keywords: Set[str] = set() + self.rules = grammar.rules + self.validate_rule_names() + if "trailer" not in grammar.metas and "start" not in self.rules: + raise GrammarError("Grammar without a trailer must have 
a 'start' rule") + checker = RuleCheckingVisitor(self.rules, self.tokens) + for rule in self.rules.values(): + checker.visit(rule) + self.file = file + self.level = 0 + self.first_graph, self.first_sccs = compute_left_recursives(self.rules) + self.counter = 0 # For name_rule()/name_loop() + self.keyword_counter = 499 # For keyword_type() + self.all_rules: Dict[str, Rule] = self.rules.copy() # Rules + temporal rules + self._local_variable_stack: List[List[str]] = [] + + def validate_rule_names(self) -> None: + for rule in self.rules: + if rule.startswith("_"): + raise GrammarError(f"Rule names cannot start with underscore: '{rule}'") + + @contextlib.contextmanager + def local_variable_context(self) -> Iterator[None]: + self._local_variable_stack.append([]) + yield + self._local_variable_stack.pop() + + @property + def local_variable_names(self) -> List[str]: + return self._local_variable_stack[-1] + + @abstractmethod + def generate(self, filename: str) -> None: + raise NotImplementedError + + @contextlib.contextmanager + def indent(self) -> Iterator[None]: + self.level += 1 + try: + yield + finally: + self.level -= 1 + + def print(self, *args: object) -> None: + if not args: + print(file=self.file) + else: + print(" " * self.level, end="", file=self.file) + print(*args, file=self.file) + + def printblock(self, lines: str) -> None: + for line in lines.splitlines(): + self.print(line) + + def collect_rules(self) -> None: + keyword_collector = KeywordCollectorVisitor(self, self.keywords, self.soft_keywords) + for rule in self.all_rules.values(): + keyword_collector.visit(rule) + + rule_collector = RuleCollectorVisitor(self.rules, self.callmakervisitor) + done: Set[str] = set() + while True: + computed_rules = list(self.all_rules) + todo = [i for i in computed_rules if i not in done] + if not todo: + break + done = set(self.all_rules) + for rulename in todo: + rule_collector.visit(self.all_rules[rulename]) + + def keyword_type(self) -> int: + self.keyword_counter += 1 + return self.keyword_counter + + def artifical_rule_from_rhs(self, rhs: Rhs) -> str: + self.counter += 1 + name = f"_tmp_{self.counter}" # TODO: Pick a nicer name. 
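+        # Sketch (names hypothetical): a parenthesised group with several
+        # alternatives, e.g.  (a | b), ends up here and becomes a synthetic rule
+        # _tmp_1: a | b; the Python call maker then emits  self._tmp_1()  at the
+        # call site.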
+ self.all_rules[name] = Rule(name, None, rhs) + return name + + def artificial_rule_from_repeat(self, node: Plain, is_repeat1: bool) -> str: + self.counter += 1 + if is_repeat1: + prefix = "_loop1_" + else: + prefix = "_loop0_" + name = f"{prefix}{self.counter}" + self.all_rules[name] = Rule(name, None, Rhs([Alt([NamedItem(None, node)])])) + return name + + def artifical_rule_from_gather(self, node: Gather) -> str: + self.counter += 1 + name = f"_gather_{self.counter}" + self.counter += 1 + extra_function_name = f"_loop0_{self.counter}" + extra_function_alt = Alt( + [NamedItem(None, node.separator), NamedItem("elem", node.node)], + action="elem", + ) + self.all_rules[extra_function_name] = Rule( + extra_function_name, + None, + Rhs([extra_function_alt]), + ) + alt = Alt( + [NamedItem("elem", node.node), NamedItem("seq", NameLeaf(extra_function_name))], + ) + self.all_rules[name] = Rule( + name, + None, + Rhs([alt]), + ) + return name + + def dedupe(self, name: str) -> str: + origname = name + counter = 0 + while name in self.local_variable_names: + counter += 1 + name = f"{origname}_{counter}" + self.local_variable_names.append(name) + return name + + +class NullableVisitor(GrammarVisitor): + def __init__(self, rules: Dict[str, Rule]) -> None: + self.rules = rules + self.visited: Set[Any] = set() + self.nullables: Set[Union[Rule, NamedItem]] = set() + + def visit_Rule(self, rule: Rule) -> bool: + if rule in self.visited: + return False + self.visited.add(rule) + if self.visit(rule.rhs): + self.nullables.add(rule) + return rule in self.nullables + + def visit_Rhs(self, rhs: Rhs) -> bool: + for alt in rhs.alts: + if self.visit(alt): + return True + return False + + def visit_Alt(self, alt: Alt) -> bool: + for item in alt.items: + if not self.visit(item): + return False + return True + + def visit_Forced(self, force: Forced) -> bool: + return True + + def visit_LookAhead(self, lookahead: Lookahead) -> bool: + return True + + def visit_Opt(self, opt: Opt) -> bool: + return True + + def visit_Repeat0(self, repeat: Repeat0) -> bool: + return True + + def visit_Repeat1(self, repeat: Repeat1) -> bool: + return False + + def visit_Gather(self, gather: Gather) -> bool: + return False + + def visit_Cut(self, cut: Cut) -> bool: + return False + + def visit_Group(self, group: Group) -> bool: + return self.visit(group.rhs) + + def visit_NamedItem(self, item: NamedItem) -> bool: + if self.visit(item.item): + self.nullables.add(item) + return item in self.nullables + + def visit_NameLeaf(self, node: NameLeaf) -> bool: + if node.value in self.rules: + return self.visit(self.rules[node.value]) + # Token or unknown; never empty. + return False + + def visit_StringLeaf(self, node: StringLeaf) -> bool: + # The string token '' is considered empty. + return not node.value + + +def compute_nullables(rules: Dict[str, Rule]) -> Set[Any]: + """Compute which rules in a grammar are nullable. + + Thanks to TatSu (tatsu/leftrec.py) for inspiration. 
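+    For example (hypothetical rules), a rule  x: y?  is nullable because the
+    optional item may match nothing, whereas  x: y+  is not, since a Repeat1
+    must consume at least one item.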
+ """ + nullable_visitor = NullableVisitor(rules) + for rule in rules.values(): + nullable_visitor.visit(rule) + return nullable_visitor.nullables + + +class InitialNamesVisitor(GrammarVisitor): + def __init__(self, rules: Dict[str, Rule]) -> None: + self.rules = rules + self.nullables = compute_nullables(rules) + + def generic_visit(self, node: Iterable[Any], *args: Any, **kwargs: Any) -> Set[Any]: + names: Set[str] = set() + for value in node: + if isinstance(value, list): + for item in value: + names |= self.visit(item, *args, **kwargs) + else: + names |= self.visit(value, *args, **kwargs) + return names + + def visit_Alt(self, alt: Alt) -> Set[Any]: + names: Set[str] = set() + for item in alt.items: + names |= self.visit(item) + if item not in self.nullables: + break + return names + + def visit_Forced(self, force: Forced) -> Set[Any]: + return set() + + def visit_LookAhead(self, lookahead: Lookahead) -> Set[Any]: + return set() + + def visit_Cut(self, cut: Cut) -> Set[Any]: + return set() + + def visit_NameLeaf(self, node: NameLeaf) -> Set[Any]: + return {node.value} + + def visit_StringLeaf(self, node: StringLeaf) -> Set[Any]: + return set() + + +def compute_left_recursives( + rules: Dict[str, Rule] +) -> Tuple[Dict[str, AbstractSet[str]], List[AbstractSet[str]]]: + graph = make_first_graph(rules) + sccs = list(sccutils.strongly_connected_components(graph.keys(), graph)) + for scc in sccs: + if len(scc) > 1: + for name in scc: + rules[name].left_recursive = True + # Try to find a leader such that all cycles go through it. + leaders = set(scc) + for start in scc: + for cycle in sccutils.find_cycles_in_scc(graph, scc, start): + # print("Cycle:", " -> ".join(cycle)) + leaders -= scc - set(cycle) + if not leaders: + raise ValueError( + f"SCC {scc} has no leadership candidate (no element is included in all cycles)" + ) + # print("Leaders:", leaders) + leader = min(leaders) # Pick an arbitrary leader from the candidates. + rules[leader].leader = True + else: + name = min(scc) # The only element. + if name in graph[name]: + rules[name].left_recursive = True + rules[name].leader = True + return graph, sccs + + +def make_first_graph(rules: Dict[str, Rule]) -> Dict[str, AbstractSet[str]]: + """Compute the graph of left-invocations. + + There's an edge from A to B if A may invoke B at its initial + position. + + Note that this requires the nullable flags to have been computed. 
+ """ + initial_name_visitor = InitialNamesVisitor(rules) + graph = {} + vertices: Set[str] = set() + for rulename, rhs in rules.items(): + graph[rulename] = names = initial_name_visitor.visit(rhs) + vertices |= names + for vertex in vertices: + graph.setdefault(vertex, set()) + return graph diff --git a/scripts/pegen/python_generator.py b/scripts/pegen/python_generator.py new file mode 100644 index 000000000..5329d0ebe --- /dev/null +++ b/scripts/pegen/python_generator.py @@ -0,0 +1,345 @@ +import os.path +import token +from typing import IO, Any, Dict, Optional, Sequence, Set, Text, Tuple + +from pegen import grammar +from pegen.grammar import ( + Alt, + Cut, + Forced, + Gather, + GrammarVisitor, + Group, + Lookahead, + NamedItem, + NameLeaf, + NegativeLookahead, + Opt, + PositiveLookahead, + Repeat0, + Repeat1, + Rhs, + Rule, + StringLeaf, +) +from pegen.parser_generator import ParserGenerator + +MODULE_PREFIX = """\ +#!/usr/bin/env python3.8 +# @generated by pegen from {filename} + +import ast +import sys +import tokenize + +from typing import Any, Optional + +from pegen.parser import memoize, memoize_left_rec, logger, Parser + +""" +MODULE_SUFFIX = """ + +if __name__ == '__main__': + from pegen.parser import simple_parser_main + simple_parser_main({class_name}) +""" + + +class InvalidNodeVisitor(GrammarVisitor): + def visit_NameLeaf(self, node: NameLeaf) -> bool: + name = node.value + return name.startswith("invalid") + + def visit_StringLeaf(self, node: StringLeaf) -> bool: + return False + + def visit_NamedItem(self, node: NamedItem) -> bool: + return self.visit(node.item) + + def visit_Rhs(self, node: Rhs) -> bool: + return any(self.visit(alt) for alt in node.alts) + + def visit_Alt(self, node: Alt) -> bool: + return any(self.visit(item) for item in node.items) + + def lookahead_call_helper(self, node: Lookahead) -> bool: + return self.visit(node.node) + + def visit_PositiveLookahead(self, node: PositiveLookahead) -> bool: + return self.lookahead_call_helper(node) + + def visit_NegativeLookahead(self, node: NegativeLookahead) -> bool: + return self.lookahead_call_helper(node) + + def visit_Opt(self, node: Opt) -> bool: + return self.visit(node.node) + + def visit_Repeat(self, node: Repeat0) -> Tuple[str, str]: + return self.visit(node.node) + + def visit_Gather(self, node: Gather) -> Tuple[str, str]: + return self.visit(node.node) + + def visit_Group(self, node: Group) -> bool: + return self.visit(node.rhs) + + def visit_Cut(self, node: Cut) -> bool: + return False + + def visit_Forced(self, node: Forced) -> bool: + return self.visit(node.node) + + +class PythonCallMakerVisitor(GrammarVisitor): + def __init__(self, parser_generator: ParserGenerator): + self.gen = parser_generator + self.cache: Dict[Any, Any] = {} + + def visit_NameLeaf(self, node: NameLeaf) -> Tuple[Optional[str], str]: + name = node.value + if name == "SOFT_KEYWORD": + return "soft_keyword", "self.soft_keyword()" + if name in ("NAME", "NUMBER", "STRING", "OP", "TYPE_COMMENT"): + name = name.lower() + return name, f"self.{name}()" + if name in ("NEWLINE", "DEDENT", "INDENT", "ENDMARKER", "ASYNC", "AWAIT"): + # Avoid using names that can be Python keywords + return "_" + name.lower(), f"self.expect({name!r})" + return name, f"self.{name}()" + + def visit_StringLeaf(self, node: StringLeaf) -> Tuple[str, str]: + return "literal", f"self.expect({node.value})" + + def visit_Rhs(self, node: Rhs) -> Tuple[Optional[str], str]: + if node in self.cache: + return self.cache[node] + if len(node.alts) == 1 and 
len(node.alts[0].items) == 1: + self.cache[node] = self.visit(node.alts[0].items[0]) + else: + name = self.gen.artifical_rule_from_rhs(node) + self.cache[node] = name, f"self.{name}()" + return self.cache[node] + + def visit_NamedItem(self, node: NamedItem) -> Tuple[Optional[str], str]: + name, call = self.visit(node.item) + if node.name: + name = node.name + return name, call + + def lookahead_call_helper(self, node: Lookahead) -> Tuple[str, str]: + name, call = self.visit(node.node) + head, tail = call.split("(", 1) + assert tail[-1] == ")" + tail = tail[:-1] + return head, tail + + def visit_PositiveLookahead(self, node: PositiveLookahead) -> Tuple[None, str]: + head, tail = self.lookahead_call_helper(node) + return None, f"self.positive_lookahead({head}, {tail})" + + def visit_NegativeLookahead(self, node: NegativeLookahead) -> Tuple[None, str]: + head, tail = self.lookahead_call_helper(node) + return None, f"self.negative_lookahead({head}, {tail})" + + def visit_Opt(self, node: Opt) -> Tuple[str, str]: + name, call = self.visit(node.node) + # Note trailing comma (the call may already have one comma + # at the end, for example when rules have both repeat0 and optional + # markers, e.g: [rule*]) + if call.endswith(","): + return "opt", call + else: + return "opt", f"{call}," + + def visit_Repeat0(self, node: Repeat0) -> Tuple[str, str]: + if node in self.cache: + return self.cache[node] + name = self.gen.artificial_rule_from_repeat(node.node, False) + self.cache[node] = name, f"self.{name}()," # Also a trailing comma! + return self.cache[node] + + def visit_Repeat1(self, node: Repeat1) -> Tuple[str, str]: + if node in self.cache: + return self.cache[node] + name = self.gen.artificial_rule_from_repeat(node.node, True) + self.cache[node] = name, f"self.{name}()" # But no trailing comma here! + return self.cache[node] + + def visit_Gather(self, node: Gather) -> Tuple[str, str]: + if node in self.cache: + return self.cache[node] + name = self.gen.artifical_rule_from_gather(node) + self.cache[node] = name, f"self.{name}()" # No trailing comma here either! 
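+        # Sketch (names hypothetical): a gather item such as  ','.expr+  expands,
+        # via artifical_rule_from_gather, into  _gather_1: elem=expr seq=_loop0_2
+        # plus the helper  _loop0_2: ',' elem=expr { elem }; the generated
+        # _gather_1 method then returns  [elem] + seq.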
+ return self.cache[node] + + def visit_Group(self, node: Group) -> Tuple[Optional[str], str]: + return self.visit(node.rhs) + + def visit_Cut(self, node: Cut) -> Tuple[str, str]: + return "cut", "True" + + def visit_Forced(self, node: Forced) -> Tuple[str, str]: + if isinstance(node.node, Group): + _, val = self.visit(node.node.rhs) + return "forced", f"self.expect_forced({val}, '''({node.node.rhs!s})''')" + else: + return ( + "forced", + f"self.expect_forced(self.expect({node.node.value}), {node.node.value!r})", + ) + + +class PythonParserGenerator(ParserGenerator, GrammarVisitor): + def __init__( + self, + grammar: grammar.Grammar, + file: Optional[IO[Text]], + tokens: Set[str] = set(token.tok_name.values()), + location_formatting: Optional[str] = None, + unreachable_formatting: Optional[str] = None, + ): + tokens.add("SOFT_KEYWORD") + super().__init__(grammar, tokens, file) + self.callmakervisitor: PythonCallMakerVisitor = PythonCallMakerVisitor(self) + self.invalidvisitor: InvalidNodeVisitor = InvalidNodeVisitor() + self.unreachable_formatting = unreachable_formatting or "None # pragma: no cover" + self.location_formatting = ( + location_formatting + or "lineno=start_lineno, col_offset=start_col_offset, " + "end_lineno=end_lineno, end_col_offset=end_col_offset" + ) + + def generate(self, filename: str) -> None: + self.collect_rules() + header = self.grammar.metas.get("header", MODULE_PREFIX) + if header is not None: + basename = os.path.basename(filename) + self.print(header.rstrip("\n").format(filename=basename)) + subheader = self.grammar.metas.get("subheader", "") + if subheader: + self.print(subheader) + cls_name = self.grammar.metas.get("class", "GeneratedParser") + self.print("# Keywords and soft keywords are listed at the end of the parser definition.") + self.print(f"class {cls_name}(Parser):") + for rule in self.all_rules.values(): + self.print() + with self.indent(): + self.visit(rule) + + self.print() + with self.indent(): + self.print(f"KEYWORDS = {tuple(self.keywords)}") + self.print(f"SOFT_KEYWORDS = {tuple(self.soft_keywords)}") + + trailer = self.grammar.metas.get("trailer", MODULE_SUFFIX.format(class_name=cls_name)) + if trailer is not None: + self.print(trailer.rstrip("\n")) + + def alts_uses_locations(self, alts: Sequence[Alt]) -> bool: + for alt in alts: + if alt.action and "LOCATIONS" in alt.action: + return True + for n in alt.items: + if isinstance(n.item, Group) and self.alts_uses_locations(n.item.rhs.alts): + return True + return False + + def visit_Rule(self, node: Rule) -> None: + is_loop = node.is_loop() + is_gather = node.is_gather() + rhs = node.flatten() + if node.left_recursive: + if node.leader: + self.print("@memoize_left_rec") + else: + # Non-leader rules in a cycle are not memoized, + # but they must still be logged. 
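+                # e.g. in a hypothetical cycle  expr -> term -> expr, only the
+                # leader ('expr', say) gets @memoize_left_rec and drives the
+                # fixed-point loop; 'term' is merely wrapped with @logger so
+                # that verbose runs still show it.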
+ self.print("@logger") + else: + self.print("@memoize") + node_type = node.type or "Any" + self.print(f"def {node.name}(self) -> Optional[{node_type}]:") + with self.indent(): + self.print(f"# {node.name}: {rhs}") + self.print("mark = self._mark()") + if self.alts_uses_locations(node.rhs.alts): + self.print("tok = self._tokenizer.peek()") + self.print("start_lineno, start_col_offset = tok.start") + if is_loop: + self.print("children = []") + self.visit(rhs, is_loop=is_loop, is_gather=is_gather) + if is_loop: + self.print("return children") + else: + self.print("return None") + + def visit_NamedItem(self, node: NamedItem) -> None: + name, call = self.callmakervisitor.visit(node.item) + if node.name: + name = node.name + if not name: + self.print(call) + else: + if name != "cut": + name = self.dedupe(name) + self.print(f"({name} := {call})") + + def visit_Rhs(self, node: Rhs, is_loop: bool = False, is_gather: bool = False) -> None: + if is_loop: + assert len(node.alts) == 1 + for alt in node.alts: + self.visit(alt, is_loop=is_loop, is_gather=is_gather) + + def visit_Alt(self, node: Alt, is_loop: bool, is_gather: bool) -> None: + has_cut = any(isinstance(item.item, Cut) for item in node.items) + with self.local_variable_context(): + if has_cut: + self.print("cut = False") + if is_loop: + self.print("while (") + else: + self.print("if (") + with self.indent(): + first = True + for item in node.items: + if first: + first = False + else: + self.print("and") + self.visit(item) + if is_gather: + self.print("is not None") + + self.print("):") + with self.indent(): + action = node.action + if not action: + if is_gather: + assert len(self.local_variable_names) == 2 + action = ( + f"[{self.local_variable_names[0]}] + {self.local_variable_names[1]}" + ) + else: + if self.invalidvisitor.visit(node): + action = "UNREACHABLE" + elif len(self.local_variable_names) == 1: + action = f"{self.local_variable_names[0]}" + else: + action = f"[{', '.join(self.local_variable_names)}]" + elif "LOCATIONS" in action: + self.print("tok = self._tokenizer.get_last_non_whitespace_token()") + self.print("end_lineno, end_col_offset = tok.end") + action = action.replace("LOCATIONS", self.location_formatting) + + if is_loop: + self.print(f"children.append({action})") + self.print(f"mark = self._mark()") + else: + if "UNREACHABLE" in action: + action = action.replace("UNREACHABLE", self.unreachable_formatting) + self.print(f"return {action}") + + self.print("self._reset(mark)") + # Skip remaining alternatives if a cut was reached. + if has_cut: + self.print("if cut: return None") diff --git a/scripts/pegen/sccutils.py b/scripts/pegen/sccutils.py new file mode 100644 index 000000000..1f0586bb2 --- /dev/null +++ b/scripts/pegen/sccutils.py @@ -0,0 +1,128 @@ +# Adapted from mypy (mypy/build.py) under the MIT license. + +from typing import * + + +def strongly_connected_components( + vertices: AbstractSet[str], edges: Dict[str, AbstractSet[str]] +) -> Iterator[AbstractSet[str]]: + """Compute Strongly Connected Components of a directed graph. + + Args: + vertices: the labels for the vertices + edges: for each vertex, gives the target vertices of its outgoing edges + + Returns: + An iterator yielding strongly connected components, each + represented as a set of vertices. Each input vertex will occur + exactly once; vertices not part of a SCC are returned as + singleton sets. + + From http://code.activestate.com/recipes/578507/. 
+ """ + identified: Set[str] = set() + stack: List[str] = [] + index: Dict[str, int] = {} + boundaries: List[int] = [] + + def dfs(v: str) -> Iterator[Set[str]]: + index[v] = len(stack) + stack.append(v) + boundaries.append(index[v]) + + for w in edges[v]: + if w not in index: + yield from dfs(w) + elif w not in identified: + while index[w] < boundaries[-1]: + boundaries.pop() + + if boundaries[-1] == index[v]: + boundaries.pop() + scc = set(stack[index[v] :]) + del stack[index[v] :] + identified.update(scc) + yield scc + + for v in vertices: + if v not in index: + yield from dfs(v) + + +def topsort( + data: Dict[AbstractSet[str], Set[AbstractSet[str]]] +) -> Iterable[AbstractSet[AbstractSet[str]]]: + """Topological sort. + + Args: + data: A map from SCCs (represented as frozen sets of strings) to + sets of SCCs, its dependencies. NOTE: This data structure + is modified in place -- for normalization purposes, + self-dependencies are removed and entries representing + orphans are added. + + Returns: + An iterator yielding sets of SCCs that have an equivalent + ordering. NOTE: The algorithm doesn't care about the internal + structure of SCCs. + + Example: + Suppose the input has the following structure: + + {A: {B, C}, B: {D}, C: {D}} + + This is normalized to: + + {A: {B, C}, B: {D}, C: {D}, D: {}} + + The algorithm will yield the following values: + + {D} + {B, C} + {A} + + From http://code.activestate.com/recipes/577413/. + """ + # TODO: Use a faster algorithm? + for k, v in data.items(): + v.discard(k) # Ignore self dependencies. + for item in set.union(*data.values()) - set(data.keys()): + data[item] = set() + while True: + ready = {item for item, dep in data.items() if not dep} + if not ready: + break + yield ready + data = {item: (dep - ready) for item, dep in data.items() if item not in ready} + assert not data, "A cyclic dependency exists amongst %r" % data + + +def find_cycles_in_scc( + graph: Dict[str, AbstractSet[str]], scc: AbstractSet[str], start: str +) -> Iterable[List[str]]: + """Find cycles in SCC emanating from start. + + Yields lists of the form ['A', 'B', 'C', 'A'], which means there's + a path from A -> B -> C -> A. The first item is always the start + argument, but the last item may be another element, e.g. ['A', + 'B', 'C', 'B'] means there's a path from A to B and there's a + cycle from B to C and back. + """ + # Basic input checks. + assert start in scc, (start, scc) + assert scc <= graph.keys(), scc - graph.keys() + + # Reduce the graph to nodes in the SCC. + graph = {src: {dst for dst in dsts if dst in scc} for src, dsts in graph.items() if src in scc} + assert start in graph + + # Recursive helper that yields cycles. + def dfs(node: str, path: List[str]) -> Iterator[List[str]]: + if node in path: + yield path + [node] + return + path = path + [node] # TODO: Make this not quadratic. 
+ for child in graph[node]: + yield from dfs(child, path) + + yield from dfs(start, []) diff --git a/scripts/pegen/testutil.py b/scripts/pegen/testutil.py new file mode 100644 index 000000000..473d208a7 --- /dev/null +++ b/scripts/pegen/testutil.py @@ -0,0 +1,140 @@ +import importlib.util +import io +import os +import pathlib +import sys +import textwrap +import token +import tokenize +from typing import IO, Any, Dict, Final, Optional, Type, cast + +from pegen.build import compile_c_extension +from pegen.c_generator import CParserGenerator +from pegen.grammar import Grammar +from pegen.grammar_parser import GeneratedParser as GrammarParser +from pegen.parser import Parser +from pegen.python_generator import PythonParserGenerator +from pegen.tokenizer import Tokenizer + +ALL_TOKENS = token.tok_name +EXACT_TOKENS = token.EXACT_TOKEN_TYPES +NON_EXACT_TOKENS = { + name for index, name in token.tok_name.items() if index not in EXACT_TOKENS.values() +} + + +def generate_parser(grammar: Grammar) -> Type[Parser]: + # Generate a parser. + out = io.StringIO() + genr = PythonParserGenerator(grammar, out) + genr.generate("") + + # Load the generated parser class. + ns: Dict[str, Any] = {} + exec(out.getvalue(), ns) + return ns["GeneratedParser"] + + +def run_parser(file: IO[bytes], parser_class: Type[Parser], *, verbose: bool = False) -> Any: + # Run a parser on a file (stream). + tokenizer = Tokenizer(tokenize.generate_tokens(file.readline)) # type: ignore # typeshed issue #3515 + parser = parser_class(tokenizer, verbose=verbose) + result = parser.start() + if result is None: + raise parser.make_syntax_error("invalid syntax") + return result + + +def parse_string( + source: str, parser_class: Type[Parser], *, dedent: bool = True, verbose: bool = False +) -> Any: + # Run the parser on a string. + if dedent: + source = textwrap.dedent(source) + file = io.StringIO(source) + return run_parser(file, parser_class, verbose=verbose) # type: ignore # typeshed issue #3515 + + +def make_parser(source: str) -> Type[Parser]: + # Combine parse_string() and generate_parser(). + grammar = parse_string(source, GrammarParser) + return generate_parser(grammar) + + +def import_file(full_name: str, path: str) -> Any: + """Import a python module from a path""" + + spec = importlib.util.spec_from_file_location(full_name, path) + assert spec is not None + mod = importlib.util.module_from_spec(spec) + + # We assume this is not None and has an exec_module() method. + # See https://docs.python.org/3/reference/import.html?highlight=exec_module#loading + loader = cast(Any, spec.loader) + loader.exec_module(mod) + return mod + + +def generate_c_parser_source(grammar: Grammar) -> str: + out = io.StringIO() + genr = CParserGenerator(grammar, ALL_TOKENS, EXACT_TOKENS, NON_EXACT_TOKENS, out) + genr.generate("") + return out.getvalue() + + +def generate_parser_c_extension( + grammar: Grammar, path: pathlib.PurePath, debug: bool = False, + library_dir: Optional[str] = None, +) -> Any: + """Generate a parser c extension for the given grammar in the given path + + Returns a module object with a parse_string() method. + TODO: express that using a Protocol. + """ + # Make sure that the working directory is empty: reusing non-empty temporary + # directories when generating extensions can lead to segmentation faults. + # Check issue #95 (https://github.com/gvanrossum/pegen/issues/95) for more + # context. 
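+    # A typical call site therefore hands in a freshly created directory, e.g.
+    # (hypothetical, requires tempfile):
+    #     with tempfile.TemporaryDirectory() as tmp:
+    #         generate_parser_c_extension(grammar, pathlib.PurePath(tmp))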
+ assert not os.listdir(path) + source = path / "parse.c" + with open(source, "w", encoding="utf-8") as file: + genr = CParserGenerator( + grammar, ALL_TOKENS, EXACT_TOKENS, NON_EXACT_TOKENS, file, debug=debug + ) + genr.generate("parse.c") + compile_c_extension( + str(source), + build_dir=str(path), + # Significant test_peg_generator speedups + disable_optimization=True, + library_dir=library_dir, + ) + + +def print_memstats() -> bool: + MiB: Final = 2 ** 20 + try: + import psutil # type: ignore + except ImportError: + return False + print("Memory stats:") + process = psutil.Process() + meminfo = process.memory_info() + res = {} + res["rss"] = meminfo.rss / MiB + res["vms"] = meminfo.vms / MiB + if sys.platform == "win32": + res["maxrss"] = meminfo.peak_wset / MiB + else: + # See https://stackoverflow.com/questions/938733/total-memory-used-by-python-process + import resource # Since it doesn't exist on Windows. + + rusage = resource.getrusage(resource.RUSAGE_SELF) + if sys.platform == "darwin": + factor = 1 + else: + factor = 1024 # Linux + res["maxrss"] = rusage.ru_maxrss * factor / MiB + for key, value in res.items(): + print(f" {key:12.12s}: {value:10.0f} MiB") + return True diff --git a/scripts/pegen/tokenizer.py b/scripts/pegen/tokenizer.py new file mode 100644 index 000000000..7ee49e143 --- /dev/null +++ b/scripts/pegen/tokenizer.py @@ -0,0 +1,118 @@ +import token +import tokenize +from typing import Dict, Iterator, List + +Mark = int # NewType('Mark', int) + +exact_token_types = token.EXACT_TOKEN_TYPES + + +def shorttok(tok: tokenize.TokenInfo) -> str: + return "%-25.25s" % f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}" + + +class Tokenizer: + """Caching wrapper for the tokenize module. + + This is pretty tied to Python's syntax. 
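+    A minimal construction sketch (input string hypothetical):
+
+        import io, tokenize
+        tok = Tokenizer(tokenize.generate_tokens(io.StringIO("x = 1\n").readline))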
+ """ + + _tokens: List[tokenize.TokenInfo] + + def __init__( + self, tokengen: Iterator[tokenize.TokenInfo], *, path: str = "", verbose: bool = False + ): + self._tokengen = tokengen + self._tokens = [] + self._index = 0 + self._verbose = verbose + self._lines: Dict[int, str] = {} + self._path = path + if verbose: + self.report(False, False) + + def getnext(self) -> tokenize.TokenInfo: + """Return the next token and updates the index.""" + cached = not self._index == len(self._tokens) + tok = self.peek() + self._index += 1 + if self._verbose: + self.report(cached, False) + return tok + + def peek(self) -> tokenize.TokenInfo: + """Return the next token *without* updating the index.""" + while self._index == len(self._tokens): + tok = next(self._tokengen) + if tok.type in (tokenize.NL, tokenize.COMMENT): + continue + if tok.type == token.ERRORTOKEN and tok.string.isspace(): + continue + if ( + tok.type == token.NEWLINE + and self._tokens + and self._tokens[-1].type == token.NEWLINE + ): + continue + self._tokens.append(tok) + if not self._path: + self._lines[tok.start[0]] = tok.line + return self._tokens[self._index] + + def diagnose(self) -> tokenize.TokenInfo: + if not self._tokens: + self.getnext() + return self._tokens[-1] + + def get_last_non_whitespace_token(self) -> tokenize.TokenInfo: + for tok in reversed(self._tokens[: self._index]): + if tok.type != tokenize.ENDMARKER and ( + tok.type < tokenize.NEWLINE or tok.type > tokenize.DEDENT + ): + break + return tok + + def get_lines(self, line_numbers: List[int]) -> List[str]: + """Retrieve source lines corresponding to line numbers.""" + if self._lines: + lines = self._lines + else: + n = len(line_numbers) + lines = {} + count = 0 + seen = 0 + with open(self._path) as f: + for l in f: + count += 1 + if count in line_numbers: + seen += 1 + lines[count] = l + if seen == n: + break + + return [lines[n] for n in line_numbers] + + def mark(self) -> Mark: + return self._index + + def reset(self, index: Mark) -> None: + if index == self._index: + return + assert 0 <= index <= len(self._tokens), (index, len(self._tokens)) + old_index = self._index + self._index = index + if self._verbose: + self.report(True, index < old_index) + + def report(self, cached: bool, back: bool) -> None: + if back: + fill = "-" * self._index + "-" + elif cached: + fill = "-" * self._index + ">" + else: + fill = "-" * self._index + "*" + if self._index == 0: + print(f"{fill} (Bof)") + else: + tok = self._tokens[self._index - 1] + print(f"{fill} {shorttok(tok)}") diff --git a/scripts/pegen/validator.py b/scripts/pegen/validator.py new file mode 100644 index 000000000..c48a01eed --- /dev/null +++ b/scripts/pegen/validator.py @@ -0,0 +1,41 @@ +from typing import Optional + +from pegen import grammar +from pegen.grammar import Alt, GrammarVisitor, Rhs, Rule + + +class ValidationError(Exception): + pass + + +class GrammarValidator(GrammarVisitor): + def __init__(self, grammar: grammar.Grammar) -> None: + self.grammar = grammar + self.rulename: Optional[str] = None + + def validate_rule(self, rulename: str, node: Rule) -> None: + self.rulename = rulename + self.visit(node) + self.rulename = None + + +class SubRuleValidator(GrammarValidator): + def visit_Rhs(self, node: Rhs) -> None: + for index, alt in enumerate(node.alts): + alts_to_consider = node.alts[index + 1 :] + for other_alt in alts_to_consider: + self.check_intersection(alt, other_alt) + + def check_intersection(self, first_alt: Alt, second_alt: Alt) -> None: + if str(second_alt).startswith(str(first_alt)): + 
raise ValidationError( + f"In {self.rulename} there is an alternative that will " + f"never be visited:\n{second_alt}" + ) + + +def validate_grammar(the_grammar: grammar.Grammar) -> None: + for validator_cls in GrammarValidator.__subclasses__(): + validator = validator_cls(the_grammar) + for rule_name, rule in the_grammar.rules.items(): + validator.validate_rule(rule_name, rule) diff --git a/www/src/py2js.js b/www/src/py2js.js index 697d5f39c..75ca0f1c7 100644 --- a/www/src/py2js.js +++ b/www/src/py2js.js @@ -8511,7 +8511,7 @@ function handle_errortoken(context, token, token_reader){ error_message = `invalid character '${token.string}' (${u})` } raise_syntax_error(context, error_message); - + } raise_syntax_error(context) } @@ -8853,6 +8853,8 @@ var create_root_node = $B.parser.create_root_node = function(src, module, return root } +$B.parse_time = 0 + $B.py2js = function(src, module, locals_id, parent_scope){ // src = Python source (string or object) // module = module name (string) @@ -8878,7 +8880,8 @@ $B.py2js = function(src, module, locals_id, parent_scope){ locals_id = locals_id[0] } - var _ast + var _ast, + t0 = window.performance.now() if($B.parser_to_ast){ console.log('use standard parser') @@ -8889,6 +8892,7 @@ $B.py2js = function(src, module, locals_id, parent_scope){ dispatch_tokens(root) _ast = root.ast() } + $B.parse_time += window.performance.now() - t0 var future = $B.future_features(_ast, filename) var symtable = $B._PySymtable_Build(_ast, filename, future) var js_obj = $B.js_from_root({ast: _ast, @@ -9352,6 +9356,7 @@ $B.run_script = function(script, src, name, url, run_loop){ console.log($B.format_indent(js, 0)) } }catch(err){ + console.log('err', err) return $B.handle_error($B.exception(err)) // in loaders.js } var _script = { diff --git a/www/src/python_parser.js b/www/src/python_parser.js index c0ec6c90e..5336cbddf 100644 --- a/www/src/python_parser.js +++ b/www/src/python_parser.js @@ -349,6 +349,7 @@ var Parser = $B.Parser = function(src, filename, mode){ this.filename = filename this.mode = mode this.memo = {} + this.cache = {} this.arena = { a_objects: [] } @@ -399,7 +400,10 @@ Parser.prototype.parse = function(){ } // If parsing succeeds, return AST object - return make_ast(match, this.tokens) + var t0 = window.performance.now() + var res = make_ast(match, this.tokens) + $B.time_make_ast += window.performance.now() - t0 + return res } Parser.prototype.clear_memo = function(){ @@ -460,12 +464,21 @@ Parser.prototype.apply_rule = function(rule, position){ return result } +function set_id(rule){ + return $B.UUID() +} + +$B.nb_eval_option = 0 +$B.nb_deja_vu = 0 + Parser.prototype.eval_option = function(rule, position){ + $B.nb_eval_option++ var tokens = this.tokens, result, start = position, join_position = false + rule.id = rule.id ?? $B.UUID() if(! rule.repeat){ result = this.eval_option_once(rule, position) }else{ @@ -539,9 +552,7 @@ Parser.prototype.eval_option_once = function(rule, position){ if(rule.choices){ for(var i = 0, len = rule.choices.length; i < len; i++){ var choice = rule.choices[i], - invalid = choice.items && choice.items.length == 1 && - choice.items[0].name && - choice.items[0].name.startsWith('invalid_') + invalid = choice.invalid ?? test_invalid(choice) if(invalid && ! 
this.use_invalid){ continue } @@ -621,6 +632,13 @@ Parser.prototype.eval_option_once = function(rule, position){ } } +function test_invalid(choice){ + choice.invalid = choice.items && choice.items.length == 1 && + choice.items[0].name && + choice.items[0].name.startsWith('invalid_') + return choice.invalid +} + Parser.prototype.eval_body = function(rule, position){ // Only for grammar rules if(debug){ @@ -630,9 +648,7 @@ Parser.prototype.eval_body = function(rule, position){ if(rule.choices){ for(var i = 0, len = rule.choices.length; i < len; i++){ var choice = rule.choices[i], - invalid = choice.items && choice.items.length == 1 && - choice.items[0].name && - choice.items[0].name.startsWith('invalid_') + invalid = choice.invalid ?? test_invalid(choice) if(invalid && ! this.use_invalid){ continue } @@ -775,6 +791,8 @@ function set_alias(L, name, value){ } // Function that generates the AST for a match +$B.time_make_ast = 0 + function make_ast(match, tokens){ // match.rule succeeds; make_ast() returns a value for the match, based on // the grammar action for the rule diff --git a/www/src/python_tokenizer.js b/www/src/python_tokenizer.js index 2cc287e75..112c8b260 100644 --- a/www/src/python_tokenizer.js +++ b/www/src/python_tokenizer.js @@ -148,6 +148,10 @@ var ops = '.,:;+-*/%~^|&=<>[](){}@', // ! is valid in f-strings function Token(type, string, start, end, line){ start = start.slice(0, 2) var res = {type, string, start, end, line} + res.num_type = $B.py_tokens[type] + if(type == 'OP'){ + res.num_type = $B.py_tokens[$B.EXACT_TOKEN_TYPES[string]] + } res[0] = type res[1] = string res[2] = start