From 7cff8a99ce2cbeab49f6f3551c34e32a0e303ded Mon Sep 17 00:00:00 2001 From: Jason Tackaberry Date: Fri, 15 Sep 2023 20:20:16 -0400 Subject: [PATCH] Refactor content parsing and ref resolution Reference resolution logic has been moved from the renderer to the parser (invoked by the prerenderer), where refs are now converted to markdown links using an intermediate `luadox:` link format. It's up to the renderer to resolve these links to whatever is appropriate. This required introducing the notion of an id to references. Ids are globally unique opaque strings that are tracked by the parser, which the renderer can consult in order to convert an id to a Reference object. This refactoring continues to pave the way for #5 and will allow for different kinds of renderers (not just HTML), where the common logic that applies to all renderers has been moved to the parser and run during the prerender stage. Additionally, tag parsing within content blocks (e.g. handling @tparam, @note, etc.) has been rewritten and hopefully simplified. (Parser.parse_raw_content()) Finally, this commit includes some optimizations: * Compiled regexp objects are now cached and reused, reducing compilation overhead * First sentence detection has been rewritten using a more naive, lower level approach that is significantly faster. During profiling, get_first_sentence() was the most disproportionately expensive function called. 
--- src/main.py | 59 +--------- src/parse.py | 280 +++++++++++++++++++++++++++------------------- src/prerender.py | 51 +++++---- src/reference.py | 27 +++-- src/render.py | 282 ++++++++++++++++++++++++++--------------------- src/utils.py | 216 +++++++++++++++++++++++++++++------- 6 files changed, 548 insertions(+), 367 deletions(-) diff --git a/src/main.py b/src/main.py index b158c83..ebd76df 100644 --- a/src/main.py +++ b/src/main.py @@ -31,11 +31,9 @@ from typing import Generator, Union, Dict, Tuple, Set from .log import log -from .assets import assets from .parse import * from .render import * from .prerender import Prerenderer -from .reference import ManualRef try: # version.py is generated at build time, so we are running from the proper @@ -51,21 +49,6 @@ # which case the module name will be inferred. BasePathsType = Dict[Union[Tuple[str, ...], None], Set[str]] -# Files from the assets directory to be copied -ASSETS = [ - 'luadox.css', - 'prism.css', - 'prism.js', - 'js-search.min.js', - 'search.js', - 'img/i-left.svg', - 'img/i-right.svg', - 'img/i-download.svg', - 'img/i-github.svg', - 'img/i-gitlab.svg', - 'img/i-bitbucket.svg', -] - class FullHelpParser(argparse.ArgumentParser): def error(self, message: str) -> None: sys.stderr.write('error: %s\n' % message) @@ -265,43 +248,9 @@ def main(): try: log.info('prerendering %d pages', len(parser.topsyms)) toprefs = Prerenderer(parser).process() - - for ref in toprefs: - if ref.userdata.get('empty') and ref.implicit: - # Reference has no content and it was also implicitly generated, so we don't render it. 
- log.info('not rendering empty %s %s', ref.type, ref.name) - continue - if isinstance(ref, ManualRef) and ref.name == 'index': - typedir = outdir - else: - typedir = os.path.join(outdir, ref.type) - os.makedirs(typedir, exist_ok=True) - outfile = os.path.join(typedir, ref.name + '.html') - log.info('rendering %s %s -> %s', ref.type, ref.name, outfile) - html = renderer.render(ref) - with open(outfile, 'w', encoding='utf8') as f: - f.write(html) - - js = renderer.render_search_index() - with open(os.path.join(outdir, 'index.js'), 'w', encoding='utf8') as f: - f.write(js) - - html = renderer.render_search_page() - with open(os.path.join(outdir, 'search.html'), 'w', encoding='utf8') as f: - f.write(html) - - if not parser.get_reference(ManualRef, 'index'): - # The user hasn't specified an index manual page, so we generate a blank - # landing page that at least presents the sidebar with available links. - html = renderer.render_landing_page() - with open(os.path.join(outdir, 'index.html'), 'w', encoding='utf8') as f: - f.write(html) - - for name in ASSETS: - outfile = os.path.join(outdir, name) - if os.path.dirname(name): - os.makedirs(os.path.dirname(outfile), exist_ok=True) - with open(outfile, 'wb') as f: - f.write(assets.get(name)) + renderer.render(toprefs, outdir) except Exception as e: log.exception('unhandled error rendering around %s:%s: %s', parser.ctx.file, parser.ctx.line, e) + sys.exit(1) + + log.info('done') diff --git a/src/parse.py b/src/parse.py index a32208a..29d8863 100644 --- a/src/parse.py +++ b/src/parse.py @@ -18,7 +18,7 @@ import re from collections import OrderedDict from configparser import ConfigParser -from typing import IO, Optional, Union, Tuple, List, Dict, Type +from typing import IO, Optional, Union, Tuple, List, Dict, Type, Match from .log import log from .reference import * @@ -63,6 +63,14 @@ def update(self, file: Union[str, None, Sentinel]=UNDEF, self.line = line +class ReferenceDict(dict): + """ + Dictionary keyed by ref type 
whose value is a list of references of that same type. + + This is a simple pattern to improve type clarity. + """ + def __getitem__(self, k: Type[RefT]) -> List[RefT]: + return super().__getitem__(k) class ParseError(ValueError): pass @@ -79,7 +87,7 @@ class Parser: def __init__(self, config: ConfigParser) -> None: self.config = config # A complete list of all Reference objects keyed by Reference subclass type - self.parsed: dict[Type[Reference], list[Reference]] = { + self.parsed = ReferenceDict({ ModuleRef: [], ClassRef: [], FunctionRef: [], @@ -87,7 +95,7 @@ def __init__(self, config: ConfigParser) -> None: SectionRef: [], TableRef: [], ManualRef: [], - } + }) # A dict of only top-level References ("toprefs"), keyed by the fully qualified # name of the reference. # @@ -109,6 +117,8 @@ def __init__(self, config: ConfigParser) -> None: # # name -> Reference self.refs: dict[str, Reference] = {} + # Maps refs by their ids, rather than names + self.refs_by_id: dict[str, Reference] = {} # This holds the context of the current file and reference being processed self.ctx = Context() @@ -148,10 +158,10 @@ def _parse_function(self, line: str) -> ParseFuncResult: found. 
""" # Form: function foo(bar, baz) - m = re.search(r'''\bfunction *([^\s(]+) *\(([^)]*)(\))?''', line) + m = recache(r'''\bfunction *([^\s(]+) *\(([^)]*)(\))?''').search(line) if not m: # Look for form: foo = function(bar, baz) - m = re.search(r'''(\S+) *= *function *\(([^)]*)(\))?''', line) + m = recache(r'''(\S+) *= *function *\(([^)]*)(\))?''').search(line) if not m: # Not a function (or not one we could recognize at least) return None, None @@ -163,7 +173,7 @@ def _parse_function(self, line: str) -> ParseFuncResult: if nextline is None: log.error('%s:%s: function definition is truncated', self.ctx.file, n) return None, None - m = re.search(r'''([^)]*)(\))?''', nextline) + m = recache(r'''([^)]*)(\))?''').search(nextline) if m: argstr, terminated = m.groups() arguments.extend([arg.strip() for arg in argstr.replace(' ', '').split(',') if arg.strip()]) @@ -179,10 +189,10 @@ def _parse_field(self, line: str) -> ParseFuncResult: but the second return value is always None. """ # Fields in the form [foo] = bar - m = re.search(r'''\[([^]]+)\] *=''', line) + m = recache(r'''\[([^]]+)\] *=''').search(line) if m: - return re.sub(r'''['"]''', '', m.group(1)), None - m = re.search(r'''\b([\S\.]+) *=''', line) + return recache(r'''['"]''').sub('', m.group(1)), None + m = recache(r'''\b([\S\.]+) *=''').search(line) if m: return m.group(1), None else: @@ -267,6 +277,7 @@ def _add_reference(self, ref: Reference, modref: Optional[Reference]=None) -> No ref.file, ref.line, ref.type, ref.name, conflict.type, conflict.file, conflict.line) else: self.refs[ref.name] = ref + self.refs_by_id[ref.id] = ref def _check_disconnected_reference(self, ref: Union[Reference, None]) -> bool: """ @@ -278,7 +289,7 @@ def _check_disconnected_reference(self, ref: Union[Reference, None]) -> bool: return True # Potentially disconnected comment stanza here, but let's first check to see if there's # any text in the comments, otherwise a blank --- would warn somewhat pointlessly. 
- content = ''.join(line.lstrip('-').strip() for (_, line) in ref.content) + content = ''.join(line.lstrip('-').strip() for (_, line) in ref.raw_content) if content: log.warning('%s:%s: comment block is not connected with any section, ignoring', ref.file, ref.line) return False @@ -331,8 +342,8 @@ def parse_source(self, f: IO[str]) -> List[str]: # Reference to the current collection, defaulting to implicit module ref collection = modref self.ctx.update(file=path) - re_start_comment_block = re.compile(r'^(---[^-]|---+$)') - re_require = re.compile(r'''\brequire\b *\(?['"]([^'"]+)['"]''') + re_start_comment_block = recache(r'^(---[^-]|---+$)') + re_require = recache(r'''\brequire\b *\(?['"]([^'"]+)['"]''') while True: n, line = self._next_line(strip=False) if n is None or line is None: @@ -396,7 +407,7 @@ def parse_source(self, f: IO[str]) -> List[str]: self.refs, file=path, line=n, scopes=scopes[:], symbol=args[0], collection=collection ) - field.content.append((n, ' '.join(args[1:]))) + field.raw_content.append((n, ' '.join(args[1:]))) self._add_reference(field, modref) elif tag == 'alias': if not args: @@ -430,7 +441,7 @@ def parse_source(self, f: IO[str]) -> List[str]: raise ParseError(f'@{tag} is missing argment') # Nothing special is otherwise needed here. else: - ref.content.append((n, line)) + ref.raw_content.append((n, line)) else: # This line doesn't start with a comment, but may have one at the end # which we remove here. @@ -555,23 +566,19 @@ def parse_manual(self, name: str, f: IO[str]) -> None: # Only h1, h2, and h3 create section references. 
if level <= 3: if ref == topref: - # TODO: use field - ref.flags['display'] = heading - ref.clear_cache() + ref.heading = heading # Symbol is used for URL fragment - symbol = re.sub(r'[^a-zA-Z0-9- ]', '', heading.lower()) - symbol = re.sub(r' +', '_', symbol).replace('_-_', '-') + symbol = recache(r'[^a-zA-Z0-9- ]').sub('', heading.lower()) + symbol = recache(r' +').sub('_', symbol).replace('_-_', '-') # Headings don't need to be unique, so check for duplicate symbol if symbol in symbols: symbol = symbol + str(symbols[symbol] + 1) symbols[symbol] = symbols.get(symbol, 0) + 1 ref = SectionRef(self.refs, file=path, line=n, scopes=[topref], symbol=symbol) - # TODO: use field - ref.flags['display'] = heading + ref.heading = heading ref.flags['level'] = level - ref.clear_cache() if ref != topref: self._add_reference(ref) @@ -580,14 +587,19 @@ def parse_manual(self, name: str, f: IO[str]) -> None: # ref's content just below. continue - ref.content.append((n, line)) + ref.raw_content.append((n, line)) + def get_reference(self, typ: Type[Reference], name: str) -> Union[Reference, None]: + """ + Returns the Reference object for the given type and name. + """ for ref in self.parsed[typ]: if ref.name == name: return ref - def _resolve_ref(self, name: str) -> Union[Reference, None]: + + def resolve_ref(self, name: str) -> Union[Reference, None]: """ Finds the Reference object for the given reference name. @@ -620,10 +632,8 @@ def _resolve_ref(self, name: str) -> Union[Reference, None]: ref = self.refs.get(clsref.name + '.' + name) if ref: break - if not ref: - return - if ref.within and not ref.userdata.get('within_topsym'): + if ref and ref.within and 'within_topsym' not in ref.userdata: # Check to see if the @within section is in the same topsym. 
collections = self.collections[ref.topsym] if ref.within not in collections: @@ -644,6 +654,7 @@ def _resolve_ref(self, name: str) -> Union[Reference, None]: return ref + def _reorder_refs(self, refs: List[RefT], topref: Optional[Reference]=None) -> List[RefT]: """ Reorders the given list of Reference objects according to any @order tags. @@ -729,7 +740,6 @@ def get_elements_in_collection(self, typ: Type[RefT], colref: CollectionRef) -> elems: list[RefT] = [] for ref in self.parsed[typ]: - assert(isinstance(ref, typ)) if topsym and topsym != ref.topsym: # We're constraining the refs search to the given topref but this ref # doesn't belong to that topref. @@ -745,11 +755,63 @@ def get_elements_in_collection(self, typ: Type[RefT], colref: CollectionRef) -> return self._reorder_refs(elems) + def render_ref_markdown(self, ref: Reference, text: Optional[str]=None, code=False) -> str: + """ + Returns the Reference as a markdown link, using luadox: as the link target, + which can be further resolved by the downstream renderer. + + If code is True, then the given text is wrapped in backticks. + """ + tick = '`' if code else '' + parens = '()' if isinstance(ref, FunctionRef) and not text else '' + return f'[{tick}{text or ref.name}{parens}{tick}](luadox:{ref.id})' + + + def _render_ref_markdown_re(self, m: Match[str]) -> str: + """ + Regexp callback to handle the @{refname} case. + """ + code: bool = (m.group(1) == '`') + ref = self.resolve_ref(m.group(2)) + if ref: + return self.render_ref_markdown(ref, m.group(3), code=code) + else: + log.warning('%s:~%s: reference "%s" could not be resolved', self.ctx.file, self.ctx.line, m.group(2)) + return m.group(3) or m.group(2) + + + def _render_backtick_ref_markdown_re(self, m: Match[str]) -> str: + """ + Regexp callback to handle the `refname` case. 
+ """ + ref = self.resolve_ref(m.group(1)) + if ref: + return self.render_ref_markdown(ref, text=m.group(1), code=True) + else: + # Couldn't resolve the ref, just return back the original text. + return '`{}`'.format(m.group(1)) - def content_to_markdown(self, content: List[Tuple[int, str]], strip_comments=True) -> Tuple[ - Dict[str, Tuple[List[str], str]], - List[Tuple[List[str], str]], - str + + def refs_to_markdown(self, block: str) -> str: + """ + Replaces `refname` and @{refname} in the given block of text with + markdown links. + """ + # return block + # self._xxx = getattr(self, '_xxx', 0) + len(block) + # log.info('process 2: %s', self._xxx) + # Resolve `ref` + block = recache(r'(? Tuple[ + Dict[str, Tuple[List[str], Content]], + List[Tuple[List[str], Content]], + Content ]: """ Parses a docstring block into markdown. @@ -761,97 +823,83 @@ def content_to_markdown(self, content: List[Tuple[int, str]], strip_comments=Tru tags, a list of (types, docstrings) for @treturn tags, and a string holding the converted content to markdown. """ - output: list[str] = [] - params: dict[str, tuple[list[str], str]] = {} - returns: list[tuple[list[str], str]] = [] - if not content: - return params, returns, '' - - # List of (tag, args, indent, lines) - tagstack: list[tuple[str, list[str], int, list[str]]] = [] - supported_tags = 'tparam', 'treturn', 'usage', 'example', 'code', 'see', 'warning', 'note' - - def end_tag(): - tag, args, indent, lines = tagstack.pop() - target = tagstack[-1][3] if tagstack else output - if tag in ('usage', 'example'): - target.append('##### ' + tag.title()) - if tag in ('usage', 'example', 'code'): - lang = 'lua' if not args else args[0] - target.append('```' + lang) - # Remove trailing newlines. - while lines and not lines[-1].strip(): - lines.pop() - # Dedent all lines according to the indentation of the - # first line. 
- indent = get_indent_level(lines[0]) - target.extend([l[indent:] for l in lines]) - target.append('```') - elif tag == 'tparam' and len(args) >= 2: - types = args[0].split('|') - name = args[1] - params[name] = types, ' '.join(args[2:] + lines) - elif tag == 'treturn': - types = args[0].split('|') - doc = ' '.join(args[1:] + lines) - returns.append((types, doc)) - elif tag == 'see': - refs = ['@{{{}}}'.format(see) for see in args] - target.append('\n\x01see{}\x03\n'.format(', '.join(refs))) - elif tag == 'warning' or tag == 'note': - # Admonition - heading = ' '.join(args) if args else tag.title() - code = '\n\x01adm{}\x02{}\x02{}\n\x03\n' - target.append(code.format(tag, heading, '\n'.join(lines))) - - # FIXME: this (frustratingly uncommented) function doesn't pass the smell test. - def end_tags(all, line: Optional[str]=None, indent: Optional[int]=None): - if not all: - end_tag() - else: - while tagstack: - end_tag() - if line and tagstack: - last_tag_indent = tagstack[-1][2] - tagstack[-1][3].append(line) - line = None - return line - - last_line = content[-1][0] - for n, line in content: + params: dict[str, tuple[list[str], Content]] = {} + returns: list[tuple[list[str], Content]] = [] + # These tags take nested content + content_tags = {'warning', 'note', 'tparam', 'treturn'} + + # We pass _refs_to_markdown() as a postprocessor for the Content (here as well as + # below) which will resolve all references when the renderer finally fetches the + # markdown content via the Markdown.get() method. + # + # List of (indent, tag, content) + stack: list[tuple[int, str, Content]] = [(0, '', Content(postprocess=self.refs_to_markdown))] + # The number of columns to dedent raw lines before adding to the parsed content. + # If None, we set this to the current line's indent level and use that as dedent + # until reset back to None. + dedent = None + # We tack on a sentinel value at the end of the raw lines which forces closure of + # all pending tags on the stack. 
+ for n, line in lines + [(-1, '')]: self.ctx.update(line=n) tag, args = self._parse_tag(line, require_comment=strip_comments) if strip_comments: line = line.lstrip('-').rstrip() indent = get_indent_level(line) - if tagstack: - last_tag_indent = tagstack[-1][2] - # Determine threshold at which we will consider the last tag to have - # terminated. - if tag: - # Any tag at the same level as the last tag (or below) will close - threshold = last_tag_indent + while len(stack) > 1 and (line or n == -1): + if stack[-1][0] < indent: + break + _, done_tag, content = stack.pop() + if done_tag in {'usage', 'example', 'code'}: + # Remove trailing newlines from the snippet before terminating the + # markdown code block. + content.md().rstrip().append('```') + # Redetect dedent level based on next line. + dedent = None + + # New content fragments are appended to the content object from the top of the + # stack. + content = stack[-1][2] + if tag: + # The Content object this tag's content will be pushed to. For tags that + # take content we initialize a new Content object, otherwise we just reuse + # the last one on the stack and append to it. + tagcontent = Content(postprocess=self.refs_to_markdown) if tag in content_tags else stack[-1][2] + stack.append((indent, tag, tagcontent)) + + if tag in {'usage', 'example', 'code'}: + if tag in {'usage', 'example'}: + # @usage and @example add a header. 
+ content.md().append(f'##### {tag.title()}\n') + lang = 'lua' if not args else args[0] + content.md().append(f'```{lang}') + # Ensure subsequent dedent is based on the first line of the code + # block + dedent = None + elif tag in {'warning', 'note'}: + heading = self.refs_to_markdown(' '.join(args) if args else tag.title()) + content.append(Admonition(tag, heading, tagcontent)) + elif tag == 'tparam' and args and len(args) >= 2: + types = args[0].split('|') + name = args[1] + tagcontent.md().append(' '.join(args[2:])) + params[name] = types, tagcontent + elif tag == 'treturn' and args: + types = args[0].split('|') + tagcontent.md().append(' '.join(args[1:])) + returns.append((types, tagcontent)) + elif tag == 'see' and args: + refs = [self.resolve_ref(see) for see in args] + content.append(SeeAlso([ref.id for ref in refs if ref])) else: - threshold = last_tag_indent - if not tag and indent > threshold and line: - tagstack[-1][3].append(line) - line = None - if n == last_line or (line and indent <= threshold): - line = end_tags(n == last_line, line if not tag else None, indent) - - if tag and args is not None: - tagstack.append((tag, args, indent, [])) - if tag not in supported_tags: - log.error('%s:%s: unknown tag @%s', self.ctx.file, n, tag) - elif n == last_line: - end_tags(n == last_line) + log.error('%s:%s: unknown tag @%s or missing arguments', self.ctx.file, n, tag) + elif line is not None: - if tagstack: - last = tagstack[-1] - last[3].append(line) - else: - output.append(line) - return params, returns, '\n'.join(output) + dedent = indent if dedent is None else dedent + content.md().append(line[dedent:]) + if len(stack) != 1: + log.error('%s:~%s: LuaDox bug: @%s is dangling', self.ctx.file, lines[-1][0], stack[-1][1]) + return params, returns, stack[0][2] diff --git a/src/prerender.py b/src/prerender.py index e6df2a4..411e1fa 100644 --- a/src/prerender.py +++ b/src/prerender.py @@ -23,9 +23,11 @@ class Prerenderer: """ - The prerender stage populates the 
specific typed Reference fields needed for rendering. - generates intermediate data structures used by renderers. All - references are resolved, and tags (such as @param) are parsed and validated. + The prerender stage populates the specific typed Reference fields needed for + rendering. generates intermediate data structures used by renderers. + + All references are resolved to markdown links (whose target is in the form + luadox:), and tags (such as @tparam) are parsed and validated. """ def __init__(self, parser: Parser): self.parser = parser @@ -34,7 +36,9 @@ def __init__(self, parser: Parser): def process(self) -> List[TopRef]: """ - Preprocesses all Reference objects created by the parser by handling all remaining tags within content docstrings, normalizing content to markdown, and returns a sorted list of toprefs for rendering. + Preprocesses all Reference objects created by the parser by handling all remaining + tags within content docstrings, normalizing content to markdown, and returns a + sorted list of toprefs for rendering. """ toprefs: list[TopRef] = [] for ref in self.parser.topsyms.values(): @@ -46,80 +50,83 @@ def process(self) -> List[TopRef]: toprefs.sort(key=lambda ref: (ref.type, ref.symbol)) return toprefs + def _do_classmod(self, topref: Union[ClassRef, ModuleRef]): has_content = False for colref in self.parser.get_collections(topref): self.ctx.update(ref=colref) # Parse out section heading and body. - _, _, md = self.parser.content_to_markdown(colref.content) + _, _, content = self.parser.parse_raw_content(colref.raw_content) if isinstance(colref, (ClassRef, ModuleRef)): heading = colref.symbol - body = md else: - heading, body = get_first_sentence(md) + heading = content.get_first_sentence(pop=True) # Fall back to section name if there is no content for the heading. 
heading = heading.strip() or colref.name colref.heading = heading - colref.body = body + colref.content = content topref.collections.append(colref) functions = list(self.parser.get_elements_in_collection(FunctionRef, colref)) fields = list(self.parser.get_elements_in_collection(FieldRef, colref)) - has_content = has_content or colref.body or functions or fields + has_content = has_content or colref.content or functions or fields colref.compact = colref.flags.get('compact', []) fullnames: bool = colref.flags.get('fullnames', False) for ref in fields: self.ctx.update(ref=ref) - _, _, md = self.parser.content_to_markdown(ref.content) + _, _, content = self.parser.parse_raw_content(ref.raw_content) ref.title = ref.flags.get('display') or (ref.name if fullnames else ref.symbol) ref.types = ref.flags.get('type', []) ref.meta = ref.flags.get('meta') - ref.md = md + ref.content = content colref.fields.append(ref) for ref in functions: self.ctx.update(ref=ref) - paramsdict, returns, md = self.parser.content_to_markdown(ref.content) + paramsdict, returns, content = self.parser.parse_raw_content(ref.raw_content) # args is as defined in the function definition in source, while params is # based on tags. Log a warning for any undocumented argument as long as # there is at least one documented parameter. - params: List[Tuple[str, List[str], str]] = [] + params: List[Tuple[str, List[str], Content]] = [] # ref.extra contains the list of parameter names as parsed from the # source. 
Construct the params list based on for param in ref.extra: try: params.append((param, *paramsdict[param])) except KeyError: - params.append((param, [], '')) + params.append((param, [], Content())) if paramsdict: log.warning('%s:%s: %s() missing @tparam for "%s" parameter', ref.file, ref.line, ref.name, param) ref.title = ref.display ref.params = params ref.returns = returns - ref.meta = ref.flags.get('meta') - ref.md = md + ref.meta = self.parser.refs_to_markdown(ref.flags['meta']) if 'meta' in ref.flags else '' + ref.content = content colref.functions.append(ref) topref.userdata['empty'] = not has_content + def _do_manual(self, topref: ManualRef): - if topref.content: + if topref.raw_content: + self.ctx.update(ref=topref) # Include any preamble before the first heading. - _, _, md = self.parser.content_to_markdown(topref.content, strip_comments=False) - topref.md = md + _, _, content = self.parser.parse_raw_content(topref.raw_content, strip_comments=False) + topref.content = content + topref.heading = self.parser.refs_to_markdown(topref.heading) for ref in self.parser.get_collections(topref): # Manuals only have SectionRefs assert(isinstance(ref, SectionRef)) self.ctx.update(ref=ref) - _, _, md = self.parser.content_to_markdown(ref.content, strip_comments=False) - ref.heading = ref.display - ref.body = md + _, _, content = self.parser.parse_raw_content(ref.raw_content, strip_comments=False) + ref.heading = self.parser.refs_to_markdown(ref.heading) + ref.content = content ref.level = int(ref.flags['level']) topref.collections.append(ref) diff --git a/src/reference.py b/src/reference.py index de0e7cb..bf621a0 100644 --- a/src/reference.py +++ b/src/reference.py @@ -23,6 +23,7 @@ from typing import TypeVar, Optional, Union, List, Tuple, Dict, Any from .log import log +from .utils import Content # Used for generics taking Reference types RefT = TypeVar('RefT', bound='Reference') @@ -97,7 +98,9 @@ class Reference: # A list of lines containing the documented content for 
this collection. Each element # is a 2-tuple in the form (line number, text) where line number is the specific line # in self.file where the comment appears, and text is in markdown format. - content: List[Tuple[int, str]] = field(default_factory=list) + raw_content: List[Tuple[int, str]] = field(default_factory=list) + # The processed (from raw) content which is set during the prerender stage + content: Content = field(default_factory=Content) # A map of modifiers that apply to this Reference that affect how it is rendered, # mostly from @tags. These are accumulated in the flags dict until all parsing # is done and then the parser process stage will convert these to proper fields in the @@ -171,10 +174,19 @@ def name(self) -> str: assert(self._name) return self._name + @property + def id(self) -> str: + """ + A globally unique identifier of the Reference. + """ + return f'{self.topref.type}#{self.topsym}#{self.name}' + @property def topsym(self) -> str: """ Returns the symbol name of our top-level reference. + + This does *not* honor @within. """ if not self._topsym: self._set_topsym() @@ -187,6 +199,8 @@ def topref(self) -> 'Reference': Returns the Reference object for the top-level reference this ref belongs to. If we're already a top-level Ref (e.g. class or module) then self is returned. + + This does *not* honor @within. 
""" # If there are no scopes, we *are* the topref return self if not self.scopes else self.parser_refs[self.topsym] @@ -280,8 +294,6 @@ class FieldRef(Reference): # User-defined meta value (parsed from @meta via flags) meta: Optional[str] = None - # Markdown content of the field - md: str = '' # Renderable display name that takes tags such as @fullnames into account title: str = '' # Allowed types for this field, which can be empty if no @type tag @@ -343,9 +355,9 @@ class FunctionRef(FieldRef): type: str = 'function' # List of (name, types, docstring) - params: List[Tuple[str, List[str], str]] = field(default_factory=list) - # List of (types, markdown)j - returns: List[Tuple[List[str], str]] = field(default_factory=list) + params: List[Tuple[str, List[str], Content]] = field(default_factory=list) + # List of (types, docstring)j + returns: List[Tuple[List[str], Content]] = field(default_factory=list) @dataclass @@ -354,7 +366,6 @@ class CollectionRef(Reference): A collection can fields and functions, """ heading: str = '' - body: str = '' # List of 'functions' and/or 'fields' to indicate which should be rendered in compact # form compact: List[str] = field(default_factory=list) @@ -366,8 +377,6 @@ class TopRef(CollectionRef): """ Represents a top-level reference such as class or module. 
""" - # Preamble before any sections - md: str = '' # Ordered list of collections within this topref, which respects @within and @reorder collections: List[CollectionRef] = field(default_factory=list) diff --git a/src/render.py b/src/render.py index 94eb694..a238a66 100644 --- a/src/render.py +++ b/src/render.py @@ -19,7 +19,7 @@ import re import mimetypes from contextlib import contextmanager -from typing import Union, Match, Tuple, List, Callable, Generator, Type +from typing import Union, Tuple, List, Callable, Generator, Type import commonmark.blocks import commonmark_extensions.tables @@ -30,19 +30,46 @@ from .parse import * from .utils import * +# Files from the assets directory to be copied +ASSETS = [ + 'luadox.css', + 'prism.css', + 'prism.js', + 'js-search.min.js', + 'search.js', + 'img/i-left.svg', + 'img/i-right.svg', + 'img/i-download.svg', + 'img/i-github.svg', + 'img/i-gitlab.svg', + 'img/i-bitbucket.svg', +] + # Effectively disable implicit code blocks commonmark.blocks.CODE_INDENT = 1000 - class CustomRendererWithTables(commonmark_extensions.tables.RendererWithTables): - def make_table_node(self, node): + def __init__(self, renderer: 'Renderer', *args, **kwargs): + self.renderer = renderer + self.parser = renderer.parser + super().__init__(*args, **kwargs) + + def make_table_node(self, _): return '' + def link(self, node, entering): + if node.destination.startswith('luadox:'): + refid = node.destination[7:] + # If this raises KeyError it indicates a bug in the parser code + ref = self.parser.refs_by_id[refid] + node.destination = self.renderer._get_ref_href(ref) + super().link(node, entering) + # https://github.com/GovReady/CommonMark-py-Extensions/issues/3#issuecomment-756499491 # Thanks to hughdavenport class TableWaitingForBug3(commonmark_extensions.tables.Table): @staticmethod - def continue_(parser, container=None): + def continue_(parser, _=None): ln = parser.current_line if not parser.indented and commonmark.blocks.peek(ln, 
parser.next_nonspace) == "|": parser.advance_next_nonspace() @@ -59,22 +86,6 @@ class Renderer: """ Takes a Parser object and provides an interface to generate rendered HTML. """ - # Common abbreviations with periods that are considered when determining what is the - # first sentence of a markdown block - RE_ABBREV = re.compile(r'(e\.?g\.|i\.?e\.|etc\.|et al\.|vs\.)', flags=re.I|re.S) - # Regexp patterns that progressively narrow down a markdown block to its first - # sentence - RE_FIRST_SENTENCE = ( - # First pass: Move everything after a paragraph break (two newlines) to - # the remaining block - re.compile(r'^(.*\n\s*\n)(.*)$', flags=re.S), - # Second pass: Move (prepend) anything including and below a markdown heading - # to the remaining block. Fixes #6. - re.compile(r'(.*)(?:^|\n)(#.*)', flags=re.S), - # Final pass: take everything up to the first period as the first sentence. - re.compile(r'^(.+?[.?!])(?: |$|\n)(.*)', flags=re.S), - ) - def __init__(self, parser: Parser): self.parser = parser self.config = parser.config @@ -120,6 +131,7 @@ def _get_ref_link_info(self, ref: Reference) -> Tuple[str, str]: topref = self.parser.refs[topsym] except KeyError: raise KeyError('top-level reference "%s" not found (from "%s")' % (topsym, ref.name)) from None + prefix = self._get_root_path() if not isinstance(ref.topref, ManualRef) or ref.topref.name != 'index': prefix += '{}/'.format(topref.type) @@ -128,8 +140,7 @@ def _get_ref_link_info(self, ref: Reference) -> Tuple[str, str]: fragment = '#' + ref.symbol if ref.scopes else '' else: fragment = '#{}'.format(ref.name) if ref.name != ref.topsym else '' - - return prefix + (ref.userdata.get('within_topsym') or ref.topsym) + '.html', fragment + return prefix + topsym + '.html', fragment def _get_ref_href(self, ref: Reference) -> str: """ @@ -139,104 +150,76 @@ def _get_ref_href(self, ref: Reference) -> str: file, fragment = self._get_ref_link_info(ref) return file + fragment - def _render_ref_markdown(self, ref: 
Reference, text: str, code=False) -> str: - """ - Returns the Reference as a markdown link. - - If code is True, then the given text is wrapped in backticks. - """ - backtick = '`' if code else '' - return '[{tick}{text}{parens}{tick}]({href})'.format( - tick=backtick, - text=text or ref.name, - parens='()' if isinstance(ref, FunctionRef) and not text else '', - href=self._get_ref_href(ref) - ) - - def _render_ref_markdown_re(self, m: Match[str]) -> str: - """ - Regexp callback to handle the @{refname} case. - """ - code: bool = (m.group(1) == '`') - ref = self.parser._resolve_ref(m.group(2)) - if ref: - return self._render_ref_markdown(ref, m.group(3), code=code) - else: - log.warning('%s:~%s: reference "%s" could not be resolved', self.ctx.file, self.ctx.line, m.group(2)) - return m.group(3) or m.group(2) - - def _render_backtick_ref_markdown_re(self, m: Match[str]) -> str: - """ - Regexp callback to handle the `refname` case. - """ - ref = self.parser._resolve_ref(m.group(1)) - if ref: - return self._render_ref_markdown(ref, text=m.group(1), code=True) - else: - # Couldn't resolve the ref, just return back the original text. - return '`{}`'.format(m.group(1)) - - def _refs_to_markdown(self, block: str) -> str: + def _permalink(self, id: str) -> str: """ - Replaces `refname` and @{refname} in the given block of text with - markdown links. + Returns the HTML for a permalink used for directly linkable references such + as section headings, functions, fields, etc. """ - # Resolve `ref` - block = re.sub(r'(?¶'.format(id) def _markdown_to_html(self, md: str) -> str: """ Renders the given markdown as HTML and returns the result. """ - md = self._refs_to_markdown(md) parser = commonmark_extensions.tables.ParserWithTables() ast = parser.parse(md) - html = CustomRendererWithTables().render(ast) - - def replace_admonition(m: Match[str]): - type, title, content = m.group(1).split('\x02') - # content = content.replace('

', '') - return '
{}

{}

\n
'.format(type, title, content.rstrip()) - - html = re.sub(r'

\x01adm([^\x03]+)\x03\s*

', replace_admonition, html, flags=re.S) - html = re.sub(r'

\x01see([^\x03]+)\x03\s*

', '
See also \\1
', html, flags=re.S) - return html + return CustomRendererWithTables(self).render(ast) + + def _content_to_html(self, content: Content) -> str: + output = [] + for elem in content: + if isinstance(elem, Markdown): + output.append(self._markdown_to_html(elem.get())) + elif isinstance(elem, Admonition): + inner = self._content_to_html(elem.content) + output.append(f'
{elem.title}
{inner.strip()}\n
') + elif isinstance(elem, SeeAlso): + refs = [self.parser.refs_by_id[id] for id in elem.refs] + md = ', '.join(self.parser.render_ref_markdown(ref) for ref in refs) + # HTML will have

tags so strip them out first. + html = self._markdown_to_html(md).strip()[3:-4] + output.append(f'
See also {html}
') + else: + raise ValueError(f'unsupported content fragment type {type(elem)}') + return '\n'.join(output) def _markdown_to_text(self, md: str) -> str: """ - Strips markdown codes from the given markdown and returns the result. + Strips markdown codes from the given Markdown and returns the result. """ # Code blocks - text = re.sub(r'```.*?```', '', md, flags=re.S) + text = recache(r'```.*?```', re.S).sub('', md) # Inline preformatted code - text = re.sub(r'`([^`]+)`', '\\1', text) + text = recache(r'`([^`]+)`').sub('\\1', text) # Headings - text = re.sub(r'#+', '', text) + text = recache(r'#+').sub('', text) # Bold - text = re.sub(r'\*([^*]+)\*', '\\1', text) + text = recache(r'\*([^*]+)\*').sub('\\1', text) # Link or inline image - text = re.sub(r'!?\[([^]]*)\]\([^)]+\)', '\\1', text) + text = recache(r'!?\[([^]]*)\]\([^)]+\)').sub('\\1', text) # Clean up non-markdown things. # Reference with custom display - text = re.sub(r'@{[^|]+\|([^}]+)\}', '\\1', text) + text = recache(r'@{[^|]+\|([^}]+)\}').sub('\\1', text) # Just a reference - text = re.sub(r'@{([^}]+)\}', '\\1', text) - # Replace admonissions with text elements - text = re.sub( - r'\x01adm[^\x02]+\x02([^\x03]+)\x03', - lambda m: m.group(1).replace('\x02', ' '), - text, - flags=re.S) - # Remove other special encoded content - text = re.sub(r'\x01([^\x03]*)\x03', '', text) + text = recache(r'@{([^}]+)\}').sub('\\1', text) # Consolidate multiple whitespaces - text = re.sub(r'\s+', ' ', text) - return text.strip() + text = recache(r'\s+').sub(' ', text) + return text + + def _content_to_text(self, content: Content) -> str: + """ + Strips markdown codes from the given Content and returns the result. 
+ """ + output = [] + for elem in content: + if isinstance(elem, Admonition): + output.append(self._markdown_to_text(elem.title)) + output.append(self._content_to_text(elem.content)) + elif isinstance(elem, Markdown): + output.append(self._markdown_to_text(elem.get())) + return '\n'.join(output).strip() + def _types_to_html(self, types: List[str]) -> str: """ @@ -245,7 +228,7 @@ def _types_to_html(self, types: List[str]) -> str: """ resolved: list[str] = [] for tp in types: - ref = self.parser._resolve_ref(tp) + ref = self.parser.resolve_ref(tp) if ref: href = self._get_ref_href(ref) tp = '{}'.format(href, tp) @@ -303,9 +286,11 @@ def _render_html(self, topref: TopRef, lines: List[str]) -> Generator[ out = lines.append root = self._get_root_path() head: list[str] = [] + css = self.config.get('project', 'css', fallback=None) if css: head.append(''.format(root, css, self._assets_version)) + favicon = self.config.get('project', 'favicon', fallback=None) if favicon: mimetype, _ = mimetypes.guess_type(favicon) @@ -313,6 +298,7 @@ def _render_html(self, topref: TopRef, lines: List[str]) -> Generator[ # Favicon is always copied to doc root, so take only the filename _, favicon = os.path.split(favicon) head.append(''.format(mimetype, root, favicon, self._assets_version)) + out(self._templates['head'].format( version=self._assets_version, title=html_title, @@ -323,7 +309,7 @@ def _render_html(self, topref: TopRef, lines: List[str]) -> Generator[ # as Search page) fall back to 'other' topref.type or 'other', # Second segment is the stripped form of the ref name. - re.sub(r'\W+', '', topref.name).lower() + recache(r'\W+').sub('', topref.name).lower() ) )) @@ -404,7 +390,7 @@ def _render_html(self, topref: TopRef, lines: List[str]) -> Generator[ # it in the list of manual pages. 
continue cls = ' class="selected"' if ref.name == topref.name else '' - out('{}'.format(cls, self._get_ref_href(ref), ref.display)) + out('{}'.format(cls, self._get_ref_href(ref), ref.heading)) out('') out('') @@ -440,10 +426,9 @@ def _render_html(self, topref: TopRef, lines: List[str]) -> Generator[ out('') out(self._templates['foot'].format(root=root, version=self._assets_version)) - def render(self, topref: TopRef) -> str: + def _render_topref(self, topref: TopRef) -> str: """ - Renders a prerendered Page to HTML, returning a string containing the rendered - HTML. + Renders a topref to HTML, returning a string containing the rendered HTML. """ lines = [] with self._render_html(topref, lines) as out: @@ -453,29 +438,22 @@ def render(self, topref: TopRef) -> str: self._render_manual(topref, out) return '\n'.join(lines) - def _permalink(self, id: str) -> str: - """ - Returns the HTML for a permalink used for directly linkable references such - as section headings, functions, fields, etc. - """ - return ''.format(id) - def _render_manual(self, topref: ManualRef, out: Callable[[str], None]) -> None: """ Renders the given manual top-level Reference as HTML, calling the given out() function for each line of HTML. """ out('
') - if topref.md: + if topref.content: # Preamble - out(self._markdown_to_html(topref.md)) + out(self._content_to_html(topref.content)) for secref in topref.collections: # Manual pages only contain SectionRefs assert(isinstance(secref, SectionRef)) out('{}'.format(secref.level, secref.symbol, secref.heading)) out(self._permalink(secref.symbol)) out(''.format(secref.level)) - out(self._markdown_to_html(secref.body)) + out(self._content_to_html(secref.content)) out('
') def _render_classmod(self, topref: Union[ClassRef, ModuleRef], out: Callable[[str], None]) -> None: @@ -522,8 +500,8 @@ def _render_classmod(self, topref: Union[ClassRef, ModuleRef], out: Callable[[st out('') out('') - if colref.body: - out(self._markdown_to_html(colref.body)) + if colref.content: + out(self._content_to_html(colref.content)) fields_title = 'Fields' fields_meta_columns = 0 @@ -582,9 +560,12 @@ def _render_classmod(self, topref: Union[ClassRef, ModuleRef], out: Callable[[st out('
') nmeta -= 1 - md = get_first_sentence(ref.md)[0] if not fields_compact else ref.md - if md: - out(''.format(self._markdown_to_html(md))) + if not fields_compact: + html = self._markdown_to_html(ref.content.get_first_sentence()) + else: + html = self._content_to_html(ref.content) + if html: + out(''.format(html)) out('') out('
{}{}
') @@ -611,8 +592,11 @@ def _render_classmod(self, topref: Union[ClassRef, ModuleRef], out: Callable[[st out('') meta -= 1 - md = get_first_sentence(ref.md)[0] if not functions_compact else ref.md - out('{}'.format(self._markdown_to_html(md))) + if not functions_compact: + html = self._markdown_to_html(ref.content.get_first_sentence()) + else: + html = self._content_to_html(ref.content) + out('{}'.format(html)) out('') out('') out('') @@ -635,7 +619,7 @@ def _render_classmod(self, topref: Union[ClassRef, ModuleRef], out: Callable[[st out(self._permalink(ref.name)) out('') out('
') - out(self._markdown_to_html(ref.md)) + out(self._content_to_html(ref.content)) out('
') out('') @@ -655,7 +639,7 @@ def _render_classmod(self, topref: Union[ClassRef, ModuleRef], out: Callable[[st out(self._permalink(ref.name)) out('') out('
') - out(self._markdown_to_html(ref.md)) + out(self._content_to_html(ref.content)) # Only show the parameters table if there's at least one documented parameter. if any(types or doc for _, types, doc in ref.params): out('
Parameters
') @@ -664,7 +648,7 @@ def _render_classmod(self, topref: Union[ClassRef, ModuleRef], out: Callable[[st out('') out('{}'.format(param)) out('({})'.format(self._types_to_html(types))) - out('{}'.format(self._markdown_to_html(doc))) + out('{}'.format(self._content_to_html(doc))) out('') out('') if ref.returns: @@ -675,7 +659,7 @@ def _render_classmod(self, topref: Union[ClassRef, ModuleRef], out: Callable[[st if len(ref.returns) > 1: out('{}.'.format(n)) out('({})'.format(self._types_to_html(types))) - out('{}'.format(self._markdown_to_html(doc))) + out('{}'.format(self._content_to_html(doc))) out('') out('') out('
') @@ -691,10 +675,9 @@ def render_search_index(self) -> str: self.ctx.update(ref=topref) lines = [] out = lines.append - def add(ref: Reference, typ: Type[Reference]): + def add(ref: RefT, typ: Type[RefT]): href = self._get_ref_href(ref) - _, _, md = self.parser.content_to_markdown(ref.content) - text = self._markdown_to_text(md) + text = self._content_to_text(ref.content) title = ref.display if typ == SectionRef and not isinstance(ref.topref, ManualRef): # Non-manual sections typically use the first sentence as the section @@ -740,3 +723,48 @@ def render_landing_page(self) -> str: with self._render_html(topref, lines): pass return '\n'.join(lines) + + + def render(self, toprefs: List[TopRef], outdir: str) -> None: + """ + Renders all toprefs to the given output directory. + + It's the caller's obligation to have passed these toprefs through the prerenderer. + """ + for ref in toprefs: + if ref.userdata.get('empty') and ref.implicit: + # Reference has no content and it was also implicitly generated, so we don't render it. + log.info('not rendering empty %s %s', ref.type, ref.name) + continue + if isinstance(ref, ManualRef) and ref.name == 'index': + typedir = outdir + else: + typedir = os.path.join(outdir, ref.type) + os.makedirs(typedir, exist_ok=True) + outfile = os.path.join(typedir, ref.name + '.html') + log.info('rendering %s %s -> %s', ref.type, ref.name, outfile) + html = self._render_topref(ref) + with open(outfile, 'w', encoding='utf8') as f: + f.write(html) + + js = self.render_search_index() + with open(os.path.join(outdir, 'index.js'), 'w', encoding='utf8') as f: + f.write(js) + + html = self.render_search_page() + with open(os.path.join(outdir, 'search.html'), 'w', encoding='utf8') as f: + f.write(html) + + if not self.parser.get_reference(ManualRef, 'index'): + # The user hasn't specified an index manual page, so we generate a blank + # landing page that at least presents the sidebar with available links. 
+ html = self.render_landing_page() + with open(os.path.join(outdir, 'index.html'), 'w', encoding='utf8') as f: + f.write(html) + + for name in ASSETS: + outfile = os.path.join(outdir, name) + if os.path.dirname(name): + os.makedirs(os.path.dirname(outfile), exist_ok=True) + with open(outfile, 'wb') as f: + f.write(assets.get(name)) diff --git a/src/utils.py b/src/utils.py index 9297366..55c1d43 100644 --- a/src/utils.py +++ b/src/utils.py @@ -13,30 +13,144 @@ # limitations under the License. __all__ = [ - 'Sentinel', 'get_first_sentence', 'get_indent_level', 'strip_trailing_comment' + 'Sentinel', 'Content', 'ContentFragment', 'Markdown', 'Admonition', 'SeeAlso', + 'recache', 'get_first_sentence', 'get_indent_level', 'strip_trailing_comment', ] import enum import re -from typing import Tuple +import string +from dataclasses import dataclass +from functools import lru_cache +from typing import Tuple, List, Callable, Optional, Pattern # Common abbreviations with periods that are considered when determining what is the -# first sentence of a markdown block -RE_ABBREV = re.compile(r'(e\.?g\.|i\.?e\.|etc\.|et al\.|vs\.)', flags=re.I|re.S) -# Regexp patterns that progressively narrow down a markdown block to its first -# sentence -RE_FIRST_SENTENCE = ( - # First pass: Move everything after a paragraph break (two newlines) to - # the remaining block - re.compile(r'^(.*\n\s*\n)(.*)$', flags=re.S), - # Second pass: Move (prepend) anything including and below a markdown heading - # to the remaining block. Fixes #6. - re.compile(r'(.*)(?:^|\n)(#.*)', flags=re.S), - # Final pass: take everything up to the first period as the first sentence. - re.compile(r'^(.+?[.?!])(?: |$|\n)(.*)', flags=re.S), -) -RE_INDENT = re.compile(r'^( *)') -RE_COMMENT = re.compile(r'--.*') +# first sentence of a markdown block. +ABBREV = { + 'e': ('e.g.', 'eg.', 'etc.', 'et al.'), + 'i': ('i.e.', 'ie.'), + 'v': ('vs.',), +} +# Used for detecting word boundaries. 
Anything *not* in this set can be considered as a +# word boundary. +WORD_CHARS = set(string.ascii_lowercase) + +# Callback type used by content objects for postprocessing finalized content. Used for +# converting refs to markdown links. +PostProcessFunc = Optional[Callable[[str], str]] + +class ContentFragment: + """ + Base class for elements of a Content list. + """ + pass + + +class Markdown(ContentFragment): + """ + Represents a markdown string. + """ + def __init__(self, value: Optional[str] = None, postprocess: Optional[PostProcessFunc]=None): + # Lines accumulated via append() + self._lines = [value] if value is not None else [] + self._postprocess = postprocess + # Cached postprocessed value + # append() is called between get() calls (this case is rare or nonexistent) + self._value: str|None = None + + def append(self, s: str) -> 'Markdown': + """ + Appends a line to the markdown string. Cannot be called after get(). + """ + assert(self._value is None) + self._lines.append(s) + return self + + def rstrip(self) -> 'Markdown': + """ + Removes trailing whitespace from the current set of lines added by append(). + """ + self._lines = ['\n'.join(self._lines).rstrip()] + return self + + def get(self) -> str: + """ + Returns the final markdown string, postprocessed if a postprocessor was passed during initialization. + + append() cannot be called after this point. + """ + if self._value is None: + md = '\n'.join(self._lines) + if self._postprocess: + md = self._postprocess(md) + self._value = md + del self._lines[:] + return self._value + + +@dataclass +class Admonition(ContentFragment): + """ + A @note or @warning admonition tag. + """ + type: str + title: str + content: 'Content' + + +@dataclass +class SeeAlso(ContentFragment): + """ + A @see tag. + """ + # List of ref ids. + refs: List[str] + + +class Content(List[ContentFragment]): + """ + Parsed and prerendered content. The prerender stage resolves all references to + 'luadox:' markdown links. 
+ + Content is captured as a list of content fragments -- the most common of which is + Markdown -- where fragments are different types of objects that the renderer needs to + decide how to translate. + """ + def __init__(self, *args, postprocess: PostProcessFunc = None): + super().__init__(*args) + self._md_postprocess = postprocess + self._first = None + + def get_first_sentence(self, pop=False) -> str: + """ + Returns the first sentence from the content. If pop is True then the content + is updated in-place to remove the sentence that was returned. + """ + if len(self) == 0: + return '' + e = self[0] + if not isinstance(e, Markdown): + return '' + first, remaining = get_first_sentence(e.get()) + if pop: + if remaining: + self[0] = Markdown(remaining) + else: + self.pop(0) + return first + + def md(self, postprocess: PostProcessFunc = None) -> Markdown: + """ + Convenience method that returns the last fragment in the content list if it's a + Markdown, or creates and appends a new one if the last element isn't Markdown. + """ + if len(self) > 0 and isinstance(self[-1], Markdown): + md = self[-1] + assert(isinstance(md, Markdown)) + else: + md = Markdown(postprocess=postprocess or self._md_postprocess) + self.append(md) + return md class Sentinel(enum.Enum): @@ -46,36 +160,62 @@ class Sentinel(enum.Enum): UNDEF = object() -def get_first_sentence(md: str) -> Tuple[str, str]: +@lru_cache(maxsize=None) +def recache(pattern: str, flags: int = 0) -> Pattern[str]: + """ + Returns a compiled regexp pattern, caching the result for subsequent invocations. + """ + return re.compile(pattern, flags) + + +def get_first_sentence(s: str) -> Tuple[str, str]: """ - Returns a 2-tuple of the first sentence from the given markdown, and - all remaining. + Returns a 2-tuple of the first sentence from the given markdown, and all remaining. """ - # This is rather cheeky, but just handles these common abbreviations so they don't - # interpreted as end-of-sentence. 
- escape = lambda m: m.group(1).replace('.', '\x00') - unescape = lambda s: s.replace('\x00', '.') - first = RE_ABBREV.sub(escape, md) - remaining = '' - for pat in RE_FIRST_SENTENCE: - m = pat.search(first) - if m: - first, pre = m.groups() - remaining = pre + remaining - # Remove period but preserve other sentence-ending punctuation from first - # sentence - return unescape(first).strip().rstrip('.'), unescape(remaining).strip() + # This is fairly low level looking code, but it performs reasonably well for what it + # does. + l = s.lower() + end = len(l) - 1 + last = '' + n = 0 + while n <= end: + c = l[n] + if c == '\n' and last == '\n': + # Treat two consecutive newlines as a sentence terminator. + break + elif c == '.': + # Is this period followed by whitespace or EOL? + if n == end or l[n+1] == ' ' or l[n+1] == '\n': + # Found end-of-sentence. + break + elif c in ABBREV and last not in WORD_CHARS: + # This character appears to start a word of an abbreviation we want to handle. + # If the next set of characters matches an abbreviation variation, skip over + # it. + for abbr in ABBREV[c]: + if l[n:n+len(abbr)] == abbr: + # Subtract 1 from the abbreviation length since we're adding 1 below + n += len(abbr) - 1 + break + last = l[n] + n += 1 + else: + # Didn't break out of while loop so we weren't able to find end-of-sentence. + # Consider the entire given string as the first sentence. + return s, '' + + # If we're here, n represents the position of the end of first sentence. + return s[:n], s[n+1:].strip() def get_indent_level(s: str) -> int: """ Returns the number of spaces on left side of the string. """ - m = RE_INDENT.search(s) + m = recache(r'^( *)').search(s) return len(m.group(1)) if m else 0 def strip_trailing_comment(line: str) -> str: - return RE_COMMENT.sub('', line) - + return recache(r'--.*').sub('', line)