Skip to content

Commit

Permalink
Fix incorrect handling of descriptive titles in USX
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit committed Nov 5, 2024
1 parent 3a9df2c commit fc2a4cd
Showing 1 changed file with 38 additions and 16 deletions.
54 changes: 38 additions & 16 deletions machine/corpora/usx_verse_parser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from __future__ import annotations

import string
from dataclasses import dataclass, field
from typing import BinaryIO, Iterable, List, Optional
from xml.etree import ElementTree

from ..scripture.verse_ref import are_overlapping_verse_ranges
from ..utils.string_utils import has_sentence_ending, is_integer
from ..utils.string_utils import has_sentence_ending
from .corpora_utils import merge_verse_ranges
from .usx_token import UsxToken
from .usx_verse import UsxVerse
Expand Down Expand Up @@ -86,25 +87,46 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
ctxt.add_token(e.tail)


# Deny-list of paragraph style names that are never treated as verse text.
_NONVERSE_PARA_STYLES = {
    "ms",
    "mr",
    "s",
    "sr",
    "r",
    "d",
    "sp",
    "rem",
    "restore",
    "cl",
}


def _is_numbered_style(style_prefix: str, style: str) -> bool:
    """Return whether *style* is *style_prefix* followed by a numeric suffix.

    Args:
        style_prefix: The base style name to match at the start of *style*.
        style: The full style name to test.

    Returns:
        ``False`` when *style* does not start with *style_prefix*; otherwise
        whatever ``is_integer`` reports for the remainder of *style* after the
        prefix.
    """
    if not style.startswith(style_prefix):
        return False
    # NOTE(review): whether an empty suffix (style == style_prefix) counts as
    # numbered depends on is_integer(""), which is defined outside this file.
    return is_integer(style[len(style_prefix) :])
# Allow-list of paragraph style names accepted as verse text. _is_verse_para
# strips trailing digits before the lookup, so only base names are listed.
_VERSE_PARA_STYLES = {
    # Paragraphs
    "p", "m", "po", "pr", "cls", "pmo", "pm", "pmc", "pmr", "pi", "pc", "mi", "nb",
    # Poetry
    "q", "qc", "qr", "qm", "qd", "b", "d",
    # Lists
    "lh", "li", "lf", "lim",
    # Deprecated
    "ph", "phi", "ps", "psi",
}


def _is_verse_para(para_elem: ElementTree.Element) -> bool:
    """Return whether a ``para`` element's style is an accepted verse style.

    The element's ``style`` attribute (empty string when absent) has any
    trailing ASCII digits removed, so numbered variants such as ``q1`` or
    ``li2`` resolve to their base names, and the result is looked up in
    ``_VERSE_PARA_STYLES``.

    Args:
        para_elem: The element whose ``style`` attribute is inspected.

    Returns:
        ``True`` if the digit-stripped style is in ``_VERSE_PARA_STYLES``;
        ``False`` otherwise, including when the attribute is missing.
    """
    style = para_elem.get("style", "")
    # Allow-list lookup: anything not explicitly listed is rejected, rather
    # than accepting every style that an exclusion list fails to mention.
    base_style = style.rstrip(string.digits)
    return base_style in _VERSE_PARA_STYLES


@dataclass
Expand Down

0 comments on commit fc2a4cd

Please sign in to comment.