From 2984de13d89189bdec614d007cc02cab773e435b Mon Sep 17 00:00:00 2001 From: Damien Daspit Date: Wed, 16 Oct 2024 21:08:55 -0500 Subject: [PATCH] Add Paratext/USFM processing tutorial - replace "strip_all_text" and "prefer_existing_text" parameters with a single enum parameter --- README.md | 1 + machine/corpora/__init__.py | 3 +- .../paratext_project_text_updater_base.py | 9 +- machine/corpora/update_usfm_parser_handler.py | 17 +- samples/paratext_usfm.ipynb | 428 ++++++++++++++++++ .../test_update_usfm_parser_handler.py | 21 +- 6 files changed, 459 insertions(+), 20 deletions(-) create mode 100644 samples/paratext_usfm.ipynb diff --git a/README.md b/README.md index 0b4714e..686190e 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,4 @@ If you would like to find out more about how to use Machine, check out the tutor - [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb) - [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb) - [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb) +- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 9cc8ff7..cb8c9da 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -50,7 +50,7 @@ normalize, unescape_spaces, ) -from .update_usfm_parser_handler import UpdateUsfmParserHandler +from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler from .usfm_file_text import UsfmFileText from .usfm_file_text_corpus import UsfmFileTextCorpus from .usfm_memory_text import UsfmMemoryText @@ -125,6 +125,7 @@ "TextRow", "TextRowFlags", "unescape_spaces", + "UpdateUsfmBehavior", "UpdateUsfmParserHandler", "UsfmAttribute", "UsfmElementType", diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index d08d064..43797cc 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -5,7 +5,7 @@ from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .scripture_ref import ScriptureRef -from .update_usfm_parser_handler import UpdateUsfmParserHandler +from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler from .usfm_parser import parse_usfm @@ -21,17 +21,14 @@ def update_usfm( book_id: str, rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, full_name: Optional[str] = None, - strip_all_text: bool = False, - prefer_existing_text: bool = True, + behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING, ) -> Optional[str]: file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): return None with self._open(file_name) as sfm_file: usfm: str = sfm_file.read().decode(self._settings.encoding) - handler = UpdateUsfmParserHandler( - rows, None if full_name is None else f"- {full_name}", strip_all_text, prefer_existing_text - ) + handler = UpdateUsfmParserHandler(rows, None if full_name is None else f"- {full_name}", behavior) try: parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) return handler.get_usfm(self._settings.stylesheet) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 18e0136..58dd217 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -1,3 +1,4 @@ +from enum import Enum, auto from typing import List, Optional, Sequence, Tuple, Union from .scripture_ref import ScriptureRef @@ -8,21 +9,25 @@ from .usfm_tokenizer import UsfmTokenizer +class UpdateUsfmBehavior(Enum): + PREFER_EXISTING = auto() + PREFER_NEW = auto() + STRIP_EXISTING = auto() + + class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler): def __init__( self, rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, id_text: Optional[str] = None, - strip_all_text: bool = False, - prefer_existing_text: bool = False, + behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING, ) -> None: super().__init__() self._rows = rows or [] self._tokens: List[UsfmToken] = [] self._new_tokens: List[UsfmToken] = [] self._id_text = id_text - self._strip_all_text = strip_all_text - self._prefer_existing_text = prefer_existing_text + self._behavior = behavior self._replace_stack: List[bool] = [] self._row_index: int = 0 self._token_index: int = 0 @@ -283,7 +288,9 @@ def _replace_with_new_tokens(self, state: UsfmParserState) -> bool: existing_text = True break use_new_tokens: bool = ( - self._strip_all_text or (new_text and not existing_text) or (new_text and not self._prefer_existing_text) + self._behavior is UpdateUsfmBehavior.STRIP_EXISTING + or (new_text and not existing_text) + or (new_text and self._behavior is UpdateUsfmBehavior.PREFER_NEW) ) if use_new_tokens: self._tokens.extend(self._new_tokens) diff --git a/samples/paratext_usfm.ipynb b/samples/paratext_usfm.ipynb new file mode 100644 index 0000000..5220643 --- /dev/null +++ b/samples/paratext_usfm.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Paratext/USFM Processing Tutorial\n", + "\n", + "Machine provides various classes for parsing and mutating USFM. These classes are used to implement the USFM and Paratext corpus classes. Machine also provides functionality for parsing Paratext project settings. The USFM processing functionality is designed to be as compatible as possible with USFM that is produced by Paratext." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install sil-machine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!git clone https://github.com/sillsdev/machine.py.git\n", + "%cd machine.py/samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Parsing USFM\n", + "\n", + "Machine provides a couple of options for parsing USFM. These classes can also be used to perform simple changes to the USFM.\n", + "\n", + "First, let's define a simple USFM string and instantiate a USFM stylesheet object using the default stylesheet that is included in Machine. If you are using a custom stylesheet, you can use that file when you create the stylesheet object." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from machine.corpora import UsfmStylesheet\n", + "\n", + "usfm = \"\"\"\\\\id MAT - Test\n", + "\\\\h Matthew\n", + "\\\\mt Matthew\n", + "\\\\ip An introduction to Matthew\n", + "\\\\c 1\n", + "\\\\s Chapter One\n", + "\\\\p\n", + "\\\\v 1 This is verse \\\\pn one\\\\pn* of chapter one.\n", + "\\\\v 2 This is verse two\\\\f + \\\\fr 1:2: \\\\ft This is a footnote.\\\\f* of chapter one. \n", + "\"\"\".replace(\"\\n\", \"\\r\\n\")\n", + "\n", + "stylesheet = UsfmStylesheet(\"usfm.sty\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 1: UsfmTokenizer\n", + "\n", + "Machine provides a `UsfmTokenizer` class that can be used to split up the USFM into separate tokens. A `UsfmToken` object is used to represent each token. The class contains minimal metadata about each token. This is the simplest method for parsing USFM. You can process the tokens by simply iterating through them. USFM can be reconstructed from the tokens by using the `detokenize` method.\n", + "\n", + "In this example, we uppercase all text tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\id MAT - TEST\n", + "\\h MATTHEW\n", + "\\mt MATTHEW\n", + "\\ip AN INTRODUCTION TO MATTHEW\n", + "\\c 1\n", + "\\s CHAPTER ONE\n", + "\\p\n", + "\\v 1 THIS IS VERSE \\pn ONE\\pn* OF CHAPTER ONE.\n", + "\\v 2 THIS IS VERSE TWO\\f + \\fr 1:2: \\ft THIS IS A FOOTNOTE.\\f* OF CHAPTER ONE.\n", + "\n" + ] + } + ], + "source": [ + "from machine.corpora import UsfmTokenizer, UsfmTokenType\n", + "\n", + "usfm_tokenizer = UsfmTokenizer(stylesheet)\n", + "tokens = usfm_tokenizer.tokenize(usfm)\n", + "for token in tokens:\n", + " if token.type is UsfmTokenType.TEXT:\n", + " token.text = token.text.upper()\n", + "\n", + "print(usfm_tokenizer.detokenize(tokens))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 2: UsfmParser\n", + "\n", + "The `UsfmParser` class is a higher-level parsing option that provides more information about the semantics and context of the current token. The `UsfmParserState` instance contains the current context and metadata.\n", + "\n", + "In this example, we use the more intelligent features of the `UsfmParser` to only uppercase verse text." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\id MAT - Test\n", + "\\h Matthew\n", + "\\mt Matthew\n", + "\\ip An introduction to Matthew\n", + "\\c 1\n", + "\\s Chapter One\n", + "\\p\n", + "\\v 1 THIS IS VERSE \\pn ONE\\pn* OF CHAPTER ONE.\n", + "\\v 2 THIS IS VERSE TWO\\f + \\fr 1:2: \\ft This is a footnote.\\f* OF CHAPTER ONE.\n", + "\n" + ] + } + ], + "source": [ + "from machine.corpora import UsfmParser\n", + "\n", + "usfm_parser = UsfmParser(usfm, stylesheet=stylesheet)\n", + "state = usfm_parser.state\n", + "while usfm_parser.process_token():\n", + " if state.token.type is UsfmTokenType.TEXT and state.is_verse_text:\n", + " state.token.text = state.token.text.upper()\n", + "\n", + "print(usfm_tokenizer.detokenize(state.tokens))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 3: UsfmParser with UsfmParserHandler\n", + "\n", + "This option is very similar to the previous except an instance of `UsfmParserHandler` is passed to the `UsfmParser`. This class provides a set of callback methods that can be extended with custom processing logic. The methods are called when certain events are encountered during parsing. For example, there are callbacks for the start/end of a book, start/end of a paragraph, a chapter/verse milestone, etc.\n", + "\n", + "In this example, we override the `text` callback to uppercase verse text just like in the previous example." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\id MAT - Test\n", + "\\h Matthew\n", + "\\mt Matthew\n", + "\\ip An introduction to Matthew\n", + "\\c 1\n", + "\\s Chapter One\n", + "\\p\n", + "\\v 1 THIS IS VERSE \\pn ONE\\pn* OF CHAPTER ONE.\n", + "\\v 2 THIS IS VERSE TWO\\f + \\fr 1:2: \\ft This is a footnote.\\f* OF CHAPTER ONE.\n", + "\n" + ] + } + ], + "source": [ + "from machine.corpora import UsfmParserHandler\n", + "\n", + "class VerseTextUppercaser(UsfmParserHandler):\n", + " def text(self, state, text):\n", + " if state.is_verse_text:\n", + " state.token.text = text.upper()\n", + "\n", + "usfm_parser = UsfmParser(usfm, VerseTextUppercaser(), stylesheet)\n", + "usfm_parser.process_tokens()\n", + "\n", + "print(usfm_tokenizer.detokenize(usfm_parser.state.tokens))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Machine provides a convenience function, `parse_usfm`, for easily parsing USFM using a handler." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is verse \n", + "one\n", + " of chapter one. \n", + "This is verse two\n", + " of chapter one.\n" + ] + } + ], + "source": [ + "from machine.corpora import parse_usfm\n", + "\n", + "class VerseTextCollector(UsfmParserHandler):\n", + " def __init__(self):\n", + " self.verse_texts = []\n", + "\n", + " def text(self, state, text):\n", + " if state.is_verse_text:\n", + " self.verse_texts.append(text)\n", + "\n", + "collector = VerseTextCollector()\n", + "parse_usfm(usfm, collector, stylesheet)\n", + "for verse_text in collector.verse_texts:\n", + " print(verse_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Processing Paratext Projects\n", + "\n", + "Machine provides classes for parsing Paratext project settings and updating the text segments in a project." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parsing Paratext Settings\n", + "\n", + "First, let's demonstrate how to parse Paratext settings. We use `FileParatextProjectSettingsParser` to read the settings for a project. The parsed settings contain important information about the project, such as the project name, file encoding, versification, USFM stylesheet, USFM file name format, language code, and Biblical terms list." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: engWEB14\n", + "Full Name: World English Bible (American Edition)\n", + "Language Code: en\n", + "Encoding: utf_8_sig\n" + ] + } + ], + "source": [ + "from machine.corpora import FileParatextProjectSettingsParser\n", + "\n", + "settings_parser = FileParatextProjectSettingsParser(\"data/WEB-PT\")\n", + "settings = settings_parser.parse()\n", + "print(\"Name:\", settings.name)\n", + "print(\"Full Name:\", settings.full_name)\n", + "print(\"Language Code:\", settings.language_code)\n", + "print(\"Encoding:\", settings.encoding)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Updating a Paratext Book\n", + "\n", + "Machine has the ability to replace the text segments in a Paratext project. This is useful for implementing machine translation systems. We will use `FileParatextProjectTextUpdater` to update the text segments. All we have to do is pass in a list of text segments and the corresponding Scripture reference to the `update_usfm` method. The class can be used to update both verse text and non-Scripture text. By default, `update_usfm` will only update text segments that are blank, so we set the `behavior` parameter to `UpdateUsfmBehavior.PREFER_NEW` to indicate that we want to replace existing segments." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\id 3JN 64-3JN-web.sfm World English Bible (WEB)\n", + "\\ide UTF-8\n", + "\\h 3 John\n", + "\\toc1 John’s Third Letter\n", + "\\toc2 3 John\n", + "\\toc3 3 John\n", + "\\mt1 THIS IS THE MAJOR TITLE OF 3 JOHN.\n", + "\\c 1\n", + "\\p\n", + "\\v 1 THIS IS THE FIRST VERSE OF 3 JOHN.\n", + "\\p\n", + "\\v 2 Beloved, I pray that you may prosper in all things and be healthy, even as your soul prospers.\n", + "\\v 3 For I rejoiced greatly when brothers came and testified about your truth, even as you walk in truth.\n", + "\\v 4 I have no greater joy than this: to hear about my children walking in truth.\n", + "\\p\n", + "\\v 5 Beloved, you do a faithful work in whatever you accomplish for those who are brothers and strangers.\n", + "\\v 6 They have testified about your love before the assembly. You will do well to send them forward on their journey in a way worthy of God,\n", + "\\v 7 because for the sake of the Name they went out, taking nothing from the Gentiles.\n", + "\\v 8 We therefore ought to receive such, that we may be fellow workers for the truth.\n", + "\\p\n", + "\\v 9 I wrote to the assembly, but Diotrephes, who loves to be first among them, doesn’t accept what we say.\n", + "\\v 10 Therefore, if I come, I will call attention to his deeds which he does, unjustly accusing us with wicked words. Not content with this, he doesn’t receive the brothers himself, and those who would, he forbids and throws out of the assembly.\n", + "\\p\n", + "\\v 11 Beloved, don’t imitate that which is evil, but that which is good. He who does good is of God. He who does evil hasn’t seen God.\n", + "\\v 12 Demetrius has the testimony of all, and of the truth itself; yes, we also testify, and you know that our testimony is true.\n", + "\\p\n", + "\\v 13 I had many things to write to you, but I am unwilling to write to you with ink and pen;\n", + "\\v 14 THIS IS THE FOURTEENTH VERSE OF 3 JOHN.\n", + "\\p\n", + "\n" + ] + } + ], + "source": [ + "from machine.corpora import FileParatextProjectTextUpdater, ScriptureRef, UpdateUsfmBehavior\n", + "\n", + "new_segments = [\n", + " ([ScriptureRef.parse(\"3JN 1:0/mt1\", settings.versification)], \"THIS IS THE MAJOR TITLE OF 3 JOHN.\"),\n", + " ([ScriptureRef.parse(\"3JN 1:1\", settings.versification)], \"THIS IS THE FIRST VERSE OF 3 JOHN.\"),\n", + " ([ScriptureRef.parse(\"3JN 1:14\", settings.versification)], \"THIS IS THE FOURTEENTH VERSE OF 3 JOHN.\")\n", + "]\n", + "\n", + "updater = FileParatextProjectTextUpdater(\"data/WEB-PT\")\n", + "new_usfm = updater.update_usfm(\"3JN\", new_segments, behavior=UpdateUsfmBehavior.PREFER_NEW)\n", + "print(new_usfm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By setting the `behavior` parameter to `UpdateUsfmBehavior.STRIP_EXISTING`, we can remove all existing text segments, leaving only the new segments and the basic USFM structure." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\id 3JN\n", + "\\ide\n", + "\\h\n", + "\\toc1\n", + "\\toc2\n", + "\\toc3\n", + "\\mt1 THIS IS THE MAJOR TITLE OF 3 JOHN.\n", + "\\c 1\n", + "\\p\n", + "\\v 1 THIS IS THE FIRST VERSE OF 3 JOHN.\n", + "\\p\n", + "\\v 2\n", + "\\v 3\n", + "\\v 4\n", + "\\p\n", + "\\v 5\n", + "\\v 6\n", + "\\v 7\n", + "\\v 8\n", + "\\p\n", + "\\v 9\n", + "\\v 10\n", + "\\p\n", + "\\v 11\n", + "\\v 12\n", + "\\p\n", + "\\v 13\n", + "\\v 14 THIS IS THE FOURTEENTH VERSE OF 3 JOHN.\n", + "\\p\n", + "\n" + ] + } + ], + "source": [ + "new_usfm = updater.update_usfm(\"3JN\", new_segments, behavior=UpdateUsfmBehavior.STRIP_EXISTING)\n", + "print(new_usfm)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sil-machine-7etMAqKM-py3.8", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 3c373b2..00eb3f3 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -2,7 +2,13 @@ from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, ignore_line_endings -from machine.corpora import FileParatextProjectTextUpdater, ScriptureRef, UpdateUsfmParserHandler, parse_usfm +from machine.corpora import ( + FileParatextProjectTextUpdater, + ScriptureRef, + UpdateUsfmBehavior, + UpdateUsfmParserHandler, + parse_usfm, +) def test_get_usfm_verse_char_style() -> None: @@ -25,7 +31,7 @@ def test_get_usfm_id_text() -> None: def test_get_usfm_strip_all_text() -> None: - target = update_usfm(strip_all_text=True) + target = update_usfm(behavior=UpdateUsfmBehavior.STRIP_EXISTING) assert target is not None assert "\\id MAT\r\n" in target assert "\\v 1\r\n" in target @@ -43,7 +49,7 @@ def test_get_usfm_prefer_existing(): str("Text 7"), ), ] - target = update_usfm(rows, prefer_existing_text=True) + target = update_usfm(rows, behavior=UpdateUsfmBehavior.PREFER_EXISTING) assert target is not None assert "\\id MAT - Test\r\n" in target assert "\\v 6 Verse 6 content.\r\n" in target @@ -61,7 +67,7 @@ def test_get_usfm_prefer_rows(): str("Text 7"), ), ] - target = update_usfm(rows, prefer_existing_text=False) + target = update_usfm(rows, behavior=UpdateUsfmBehavior.PREFER_NEW) assert target is not None assert "\\id MAT - Test\r\n" in target assert "\\v 6 Text 6\r\n" in target @@ -452,15 +458,14 @@ def update_usfm( rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, source: Optional[str] = None, id_text: Optional[str] = None, - strip_all_text: bool = False, - prefer_existing_text: bool = False, + behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_NEW, ) -> Optional[str]: if source is None: updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH) - return updater.update_usfm("MAT", rows, id_text, strip_all_text, prefer_existing_text) + return updater.update_usfm("MAT", rows, id_text, behavior) else: source = source.strip().replace("\r\n", "\n") + "\r\n" - updater = UpdateUsfmParserHandler(rows, id_text, strip_all_text, prefer_existing_text) + updater = UpdateUsfmParserHandler(rows, id_text, behavior) parse_usfm(source, updater) return updater.get_usfm()