From fc2a4cd70421457682b76e9c4d54910a593f1f03 Mon Sep 17 00:00:00 2001 From: Damien Daspit Date: Tue, 5 Nov 2024 15:07:07 -0500 Subject: [PATCH] Fix incorrect handling of descriptive titles in USX --- machine/corpora/usx_verse_parser.py | 54 ++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/machine/corpora/usx_verse_parser.py b/machine/corpora/usx_verse_parser.py index c857b20..2d1dfd4 100644 --- a/machine/corpora/usx_verse_parser.py +++ b/machine/corpora/usx_verse_parser.py @@ -1,11 +1,12 @@ from __future__ import annotations +import string from dataclasses import dataclass, field from typing import BinaryIO, Iterable, List, Optional from xml.etree import ElementTree from ..scripture.verse_ref import are_overlapping_verse_ranges -from ..utils.string_utils import has_sentence_ending, is_integer +from ..utils.string_utils import has_sentence_ending from .corpora_utils import merge_verse_ranges from .usx_token import UsxToken from .usx_verse import UsxVerse @@ -86,25 +87,46 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter ctxt.add_token(e.tail) -_NONVERSE_PARA_STYLES = {"ms", "mr", "s", "sr", "r", "d", "sp", "rem", "restore", "cl"} - - -def _is_numbered_style(style_prefix: str, style: str) -> bool: - return style.startswith(style_prefix) and is_integer(style[len(style_prefix) :]) +_VERSE_PARA_STYLES = { + # Paragraphs + "p", + "m", + "po", + "pr", + "cls", + "pmo", + "pm", + "pmc", + "pmr", + "pi", + "pc", + "mi", + "nb", + # Poetry + "q", + "qc", + "qr", + "qm", + "qd", + "b", + "d", + # Lists + "lh", + "li", + "lf", + "lim", + # Deprecated + "ph", + "phi", + "ps", + "psi", +} def _is_verse_para(para_elem: ElementTree.Element) -> bool: style = para_elem.get("style", "") - if style in _NONVERSE_PARA_STYLES: - return False - - if _is_numbered_style("ms", style): - return False - - if _is_numbered_style("s", style): - return False - - return True + style = style.rstrip(string.digits) + return style in _VERSE_PARA_STYLES @dataclass