From fc2a4cd70421457682b76e9c4d54910a593f1f03 Mon Sep 17 00:00:00 2001
From: Damien Daspit <damien_daspit@sil.org>
Date: Tue, 5 Nov 2024 15:07:07 -0500
Subject: [PATCH] Fix incorrect handling of descriptive titles in USX

---
 machine/corpora/usx_verse_parser.py | 54 ++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 16 deletions(-)

diff --git a/machine/corpora/usx_verse_parser.py b/machine/corpora/usx_verse_parser.py
index c857b20..2d1dfd4 100644
--- a/machine/corpora/usx_verse_parser.py
+++ b/machine/corpora/usx_verse_parser.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
+import string
 from dataclasses import dataclass, field
 from typing import BinaryIO, Iterable, List, Optional
 from xml.etree import ElementTree
 
 from ..scripture.verse_ref import are_overlapping_verse_ranges
-from ..utils.string_utils import has_sentence_ending, is_integer
+from ..utils.string_utils import has_sentence_ending
 from .corpora_utils import merge_verse_ranges
 from .usx_token import UsxToken
 from .usx_verse import UsxVerse
@@ -86,25 +87,46 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
                 ctxt.add_token(e.tail)
 
 
-_NONVERSE_PARA_STYLES = {"ms", "mr", "s", "sr", "r", "d", "sp", "rem", "restore", "cl"}
-
-
-def _is_numbered_style(style_prefix: str, style: str) -> bool:
-    return style.startswith(style_prefix) and is_integer(style[len(style_prefix) :])
+_VERSE_PARA_STYLES = {
+    # Paragraphs
+    "p",
+    "m",
+    "po",
+    "pr",
+    "cls",
+    "pmo",
+    "pm",
+    "pmc",
+    "pmr",
+    "pi",
+    "pc",
+    "mi",
+    "nb",
+    # Poetry
+    "q",
+    "qc",
+    "qr",
+    "qm",
+    "qd",
+    "b",
+    "d",
+    # Lists
+    "lh",
+    "li",
+    "lf",
+    "lim",
+    # Deprecated
+    "ph",
+    "phi",
+    "ps",
+    "psi",
+}
 
 
 def _is_verse_para(para_elem: ElementTree.Element) -> bool:
     style = para_elem.get("style", "")
-    if style in _NONVERSE_PARA_STYLES:
-        return False
-
-    if _is_numbered_style("ms", style):
-        return False
-
-    if _is_numbered_style("s", style):
-        return False
-
-    return True
+    style = style.rstrip(string.digits)
+    return style in _VERSE_PARA_STYLES
 
 
 @dataclass