Add Paratext/USFM processing tutorial

- replace "strip_all_text" and "prefer_existing_text" parameters with a single enum parameter
sillsdev · Oct 17, 2024 · 2984de1
1 parent b7c06c8
commit 2984de1
Show file tree

Hide file tree

Showing 6 changed files with 459 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -17,3 +17,4 @@ If you would like to find out more about how to use Machine, check out the tutor
 - [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
 - [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
 - [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
+- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)
diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
@@ -50,7 +50,7 @@
     normalize,
     unescape_spaces,
 )
-from .update_usfm_parser_handler import UpdateUsfmParserHandler
+from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler
 from .usfm_file_text import UsfmFileText
 from .usfm_file_text_corpus import UsfmFileTextCorpus
 from .usfm_memory_text import UsfmMemoryText
@@ -125,6 +125,7 @@
     "TextRow",
     "TextRowFlags",
     "unescape_spaces",
+    "UpdateUsfmBehavior",
     "UpdateUsfmParserHandler",
     "UsfmAttribute",
     "UsfmElementType",

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
@@ -5,7 +5,7 @@
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from .scripture_ref import ScriptureRef
-from .update_usfm_parser_handler import UpdateUsfmParserHandler
+from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler
 from .usfm_parser import parse_usfm
 
 
@@ -21,17 +21,14 @@ def update_usfm(
         book_id: str,
         rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
         full_name: Optional[str] = None,
-        strip_all_text: bool = False,
-        prefer_existing_text: bool = True,
+        behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING,
     ) -> Optional[str]:
         file_name: str = self._settings.get_book_file_name(book_id)
         if not self._exists(file_name):
             return None
         with self._open(file_name) as sfm_file:
             usfm: str = sfm_file.read().decode(self._settings.encoding)
-        handler = UpdateUsfmParserHandler(
-            rows, None if full_name is None else f"- {full_name}", strip_all_text, prefer_existing_text
-        )
+        handler = UpdateUsfmParserHandler(rows, None if full_name is None else f"- {full_name}", behavior)
         try:
             parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
             return handler.get_usfm(self._settings.stylesheet)

diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
@@ -1,3 +1,4 @@
+from enum import Enum, auto
 from typing import List, Optional, Sequence, Tuple, Union
 
 from .scripture_ref import ScriptureRef
@@ -8,21 +9,25 @@
 from .usfm_tokenizer import UsfmTokenizer
 
 
+class UpdateUsfmBehavior(Enum):
+    PREFER_EXISTING = auto()
+    PREFER_NEW = auto()
+    STRIP_EXISTING = auto()
+
+
 class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
     def __init__(
         self,
         rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
         id_text: Optional[str] = None,
-        strip_all_text: bool = False,
-        prefer_existing_text: bool = False,
+        behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING,
     ) -> None:
         super().__init__()
         self._rows = rows or []
         self._tokens: List[UsfmToken] = []
         self._new_tokens: List[UsfmToken] = []
         self._id_text = id_text
-        self._strip_all_text = strip_all_text
-        self._prefer_existing_text = prefer_existing_text
+        self._behavior = behavior
         self._replace_stack: List[bool] = []
         self._row_index: int = 0
         self._token_index: int = 0
@@ -283,7 +288,9 @@ def _replace_with_new_tokens(self, state: UsfmParserState) -> bool:
                 existing_text = True
                 break
         use_new_tokens: bool = (
-            self._strip_all_text or (new_text and not existing_text) or (new_text and not self._prefer_existing_text)
+            self._behavior is UpdateUsfmBehavior.STRIP_EXISTING
+            or (new_text and not existing_text)
+            or (new_text and self._behavior is UpdateUsfmBehavior.PREFER_NEW)
         )
         if use_new_tokens:
             self._tokens.extend(self._new_tokens)