Skip to content

Commit

Permalink
Add scripts of CPython Tools/peg_generator/pegen to create Javascript…
Browse files Browse the repository at this point in the history
… PEG parser. Related to issue #2354
  • Loading branch information
PierreQuentel committed Jan 7, 2024
1 parent 3ce0dd2 commit 97b7e31
Show file tree
Hide file tree
Showing 22 changed files with 5,613 additions and 9 deletions.
Empty file added scripts/pegen/__init__.py
Empty file.
259 changes: 259 additions & 0 deletions scripts/pegen/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
#!/usr/bin/env python3.8

"""pegen -- PEG Generator.
Search the web for PEG Parsers for reference.
"""

import argparse
import sys
import time
import token
import traceback
from typing import Tuple

from pegen.build import Grammar, Parser, ParserGenerator, Tokenizer
from pegen.validator import validate_grammar


def generate_c_code(
    args: argparse.Namespace,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
    """Build a C parser and generator from the grammar named in *args*.

    Returns the ``(grammar, parser, tokenizer, generator)`` tuple produced by
    ``pegen.build``.  On failure, prints a short diagnostic and exits with
    status 1; with ``-v`` the exception is re-raised so the full traceback
    is shown instead.
    """
    from pegen.build import build_c_parser_and_generator

    verbose = args.verbose
    # -vvv enables tokenizer tracing; -vv or -vvvv enables parser tracing.
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4
    try:
        grammar, parser, tokenizer, gen = build_c_parser_and_generator(
            args.grammar_filename,
            args.tokens_filename,
            args.output,
            args.compile_extension,
            verbose_tokenizer,
            verbose_parser,
            args.verbose,
            # Asserts stay in the extension unless an optimized build was requested.
            keep_asserts_in_extension=not args.optimized,
            skip_actions=args.skip_actions,
        )
        return grammar, parser, tokenizer, gen
    except Exception as err:
        if args.verbose:
            raise  # Show traceback
        traceback.print_exception(err.__class__, err, None)
        sys.stderr.write("For full traceback, use -v\n")
        sys.exit(1)


def generate_python_code(
    args: argparse.Namespace,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
    """Build a Python parser and generator for the grammar given in *args*.

    Returns the ``(grammar, parser, tokenizer, generator)`` tuple from
    ``pegen.build``; on error prints a short message and exits with status 1,
    unless ``-v`` was given, in which case the exception propagates.
    """
    from pegen.build import build_python_parser_and_generator

    level = args.verbose
    # Tokenizer tracing at -vvv; parser tracing at -vv or -vvvv.
    trace_tokenizer = level >= 3
    trace_parser = level == 2 or level >= 4
    try:
        grammar, parser, tokenizer, gen = build_python_parser_and_generator(
            args.grammar_filename,
            args.output,
            trace_tokenizer,
            trace_parser,
            skip_actions=args.skip_actions,
        )
        return grammar, parser, tokenizer, gen
    except Exception as err:
        if not args.verbose:
            traceback.print_exception(err.__class__, err, None)
            sys.stderr.write("For full traceback, use -v\n")
            sys.exit(1)
        raise  # Show traceback

def generate_javascript_code(
    args: argparse.Namespace,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
    """Build a Javascript parser and generator from the grammar in *args*.

    Mirrors generate_c_code: returns the ``(grammar, parser, tokenizer,
    generator)`` tuple from ``pegen.build``; on failure prints a short
    diagnostic and exits with status 1 (re-raises with ``-v``).
    """
    # NOTE(review): this function used to be defined twice in this module; the
    # earlier copy (which did not pass tokens_filename / compile_extension and
    # was shadowed by this one) has been removed.
    from pegen.build import build_javascript_parser_and_generator

    verbose = args.verbose
    # -vvv enables tokenizer tracing; -vv or -vvvv enables parser tracing.
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4
    try:
        grammar, parser, tokenizer, gen = build_javascript_parser_and_generator(
            args.grammar_filename,
            args.tokens_filename,
            args.output,
            args.compile_extension,
            verbose_tokenizer,
            verbose_parser,
            args.verbose,
            # Asserts stay in the extension unless an optimized build was requested.
            keep_asserts_in_extension=not args.optimized,
            skip_actions=args.skip_actions,
        )
        return grammar, parser, tokenizer, gen
    except Exception as err:
        if args.verbose:
            raise  # Show traceback
        traceback.print_exception(err.__class__, err, None)
        sys.stderr.write("For full traceback, use -v\n")
        sys.exit(1)

# Command-line interface: one global parser with a subcommand per target
# language (c / python / javascript), each wired to its generate_* function.
argparser = argparse.ArgumentParser(
    prog="pegen", description="Experimental PEG-like parser generator"
)
argparser.add_argument("-q", "--quiet", action="store_true", help="Don't print the parsed grammar")
argparser.add_argument(
    "-v",
    "--verbose",
    action="count",
    default=0,
    help="Print timing stats; repeat for more debug output",
)
subparsers = argparser.add_subparsers(help="target language for the generated code")

c_parser = subparsers.add_parser("c", help="Generate C code for inclusion into CPython")
c_parser.set_defaults(func=generate_c_code)
c_parser.add_argument("grammar_filename", help="Grammar description")
c_parser.add_argument("tokens_filename", help="Tokens description")
c_parser.add_argument(
    "-o", "--output", metavar="OUT", default="parse.c", help="Where to write the generated parser"
)
c_parser.add_argument(
    "--compile-extension",
    action="store_true",
    help="Compile generated C code into an extension module",
)
c_parser.add_argument(
    "--optimized", action="store_true", help="Compile the extension in optimized mode"
)
c_parser.add_argument(
    "--skip-actions",
    action="store_true",
    help="Suppress code emission for rule actions",
)

python_parser = subparsers.add_parser("python", help="Generate Python code")
python_parser.set_defaults(func=generate_python_code)
python_parser.add_argument("grammar_filename", help="Grammar description")
python_parser.add_argument(
    "-o",
    "--output",
    metavar="OUT",
    default="parse.py",
    help="Where to write the generated parser",
)
python_parser.add_argument(
    "--skip-actions",
    action="store_true",
    help="Suppress code emission for rule actions",
)

# Fixed copy-pasted help text: the Javascript target is not "for inclusion
# into CPython".
javascript_parser = subparsers.add_parser("javascript", help="Generate Javascript code")
javascript_parser.set_defaults(func=generate_javascript_code)
javascript_parser.add_argument("grammar_filename", help="Grammar description")
javascript_parser.add_argument("tokens_filename", help="Tokens description")
javascript_parser.add_argument(
    "-o", "--output", metavar="OUT", default="parse.js", help="Where to write the generated parser"
)
javascript_parser.add_argument(
    "--compile-extension",
    action="store_true",
    # NOTE(review): help text copied from the C subcommand — confirm whether the
    # javascript builder really compiles a C extension, and reword if not.
    help="Compile generated C code into an extension module",
)
javascript_parser.add_argument(
    "--optimized", action="store_true", help="Compile the extension in optimized mode"
)
javascript_parser.add_argument(
    "--skip-actions",
    action="store_true",
    help="Suppress code emission for rule actions",
)


def main() -> None:
    """Parse command-line arguments, run the selected generator, and report.

    Dispatches to the generate_* function chosen by the subcommand, validates
    the grammar, then (unless --quiet) prints the grammar and, with -v,
    timing, first-set, and cache statistics.
    """
    from pegen.testutil import print_memstats

    args = argparser.parse_args()
    if "func" not in args:
        # No subcommand given: set_defaults(func=...) never ran.
        # (Message updated to include the 'javascript' target.)
        argparser.error("Must specify the target language mode ('c', 'python' or 'javascript')")

    t0 = time.time()
    grammar, parser, tokenizer, gen = args.func(args)
    t1 = time.time()

    validate_grammar(grammar)

    if not args.quiet:
        if args.verbose:
            print("Raw Grammar:")
            for line in repr(grammar).splitlines():
                print(" ", line)

        print("Clean Grammar:")
        for line in str(grammar).splitlines():
            print(" ", line)

    if args.verbose:
        print("First Graph:")
        for src, dsts in gen.first_graph.items():
            print(f"  {src} -> {', '.join(dsts)}")
        print("First SCCS:")
        for scc in gen.first_sccs:
            print(" ", scc, end="")
            if len(scc) > 1:
                # A multi-rule SCC means mutual (indirect) left recursion.
                print(
                    "  # Indirectly left-recursive; leaders:",
                    {name for name in scc if grammar.rules[name].leader},
                )
            else:
                name = next(iter(scc))
                if name in gen.first_graph[name]:
                    print("  # Left-recursive")
                else:
                    print()

    if args.verbose:
        dt = t1 - t0
        diag = tokenizer.diagnose()
        nlines = diag.end[0]
        if diag.type == token.ENDMARKER:
            # The ENDMARKER token sits on a phantom extra line; don't count it.
            nlines -= 1
        print(f"Total time: {dt:.3f} sec; {nlines} lines", end="")
        if dt:
            print(f"; {nlines / dt:.0f} lines/sec")
        else:
            print()
        print("Caches sizes:")
        print(f"  token array : {len(tokenizer._tokens):10}")
        print(f"        cache : {len(parser._cache):10}")
        if not print_memstats():
            print("(Can't find psutil; install it for memory stats.)")


if __name__ == "__main__":
    # Refuse to run on interpreters older than the minimum pegen supports.
    minimum = (3, 8)
    if sys.version_info < minimum:
        print("ERROR: using pegen requires at least Python 3.8!", file=sys.stderr)
        sys.exit(1)
    main()
71 changes: 71 additions & 0 deletions scripts/pegen/ast_dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
Copy-parse of ast.dump, removing the `isinstance` checks. This is needed,
because testing pegen requires generating a C extension module, which contains
a copy of the symbols defined in Python-ast.c. Thus, the isinstance check would
always fail. We rely on string comparison of the base classes instead.
TODO: Remove the above-described hack.
"""

from typing import Any, Optional, Tuple


def ast_dump(
    node: Any,
    annotate_fields: bool = True,
    include_attributes: bool = False,
    *,
    indent: Optional[str] = None,
) -> str:
    """Return a string dump of *node*, like ``ast.dump``.

    Unlike ``ast.dump``, AST-ness is detected by looking for a class named
    "AST" in the MRO instead of ``isinstance`` (see the module docstring for
    why).  *annotate_fields* emits ``name=value`` pairs instead of bare
    values; *include_attributes* also dumps ``_attributes`` (lineno etc.);
    *indent* (a string, or an int converted to that many spaces) switches to
    multi-line pretty-printed output.
    """
    def _format(node: Any, level: int = 0) -> Tuple[str, bool]:
        # Returns (rendered_text, is_simple); is_simple is used to decide
        # whether an indented node can stay on a single line.
        if indent is not None:
            level += 1
            prefix = "\n" + indent * level
            sep = ",\n" + indent * level
        else:
            prefix = ""
            sep = ", "
        # String-based AST check (see module docstring / function docstring).
        if any(cls.__name__ == "AST" for cls in node.__class__.__mro__):
            cls = type(node)
            args = []
            allsimple = True
            keywords = annotate_fields
            for name in node._fields:
                try:
                    value = getattr(node, name)
                except AttributeError:
                    # Field unset: later fields must be keywords, since the
                    # positional slots no longer line up.
                    keywords = True
                    continue
                if value is None and getattr(cls, name, ...) is None:
                    # Field still holds its class-level default of None —
                    # omit it, and force keyword form for what follows.
                    keywords = True
                    continue
                value, simple = _format(value, level)
                allsimple = allsimple and simple
                if keywords:
                    args.append("%s=%s" % (name, value))
                else:
                    args.append(value)
            if include_attributes and node._attributes:
                for name in node._attributes:
                    try:
                        value = getattr(node, name)
                    except AttributeError:
                        continue
                    if value is None and getattr(cls, name, ...) is None:
                        continue
                    value, simple = _format(value, level)
                    allsimple = allsimple and simple
                    args.append("%s=%s" % (name, value))
            if allsimple and len(args) <= 3:
                # Short, all-simple nodes stay on one line even when
                # indenting; "not args" marks empty nodes as simple.
                return "%s(%s)" % (node.__class__.__name__, ", ".join(args)), not args
            return "%s(%s%s)" % (node.__class__.__name__, prefix, sep.join(args)), False
        elif isinstance(node, list):
            if not node:
                return "[]", True
            return "[%s%s]" % (prefix, sep.join(_format(x, level)[0] for x in node)), False
        # Leaf value (constant, identifier string, ...): repr is final.
        return repr(node), True

    if all(cls.__name__ != "AST" for cls in node.__class__.__mro__):
        raise TypeError("expected AST, got %r" % node.__class__.__name__)
    if indent is not None and not isinstance(indent, str):
        # Accept an int indent like ast.dump does: convert to spaces.
        indent = " " * indent
    return _format(node)[0]
Loading

0 comments on commit 97b7e31

Please sign in to comment.