Skip to content

Commit

Permalink
raise error when id tag doesn't match filename book id
Browse files Browse the repository at this point in the history
  • Loading branch information
mshannon-sil committed Nov 11, 2024
1 parent 0fb9518 commit 8679b78
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 27 deletions.
32 changes: 20 additions & 12 deletions machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,26 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all
for sfm_entry in archive.filelist:
book_id = settings.get_book_id(sfm_entry.filename)
if book_id:
texts.append(
UsfmZipText(
settings.stylesheet,
settings.encoding,
book_id,
filename,
sfm_entry.filename,
versification,
include_markers,
include_all_text,
settings.name,
)
text = UsfmZipText(
settings.stylesheet,
settings.encoding,
book_id,
filename,
sfm_entry.filename,
versification,
include_markers,
include_all_text,
settings.name,
)
with text.get_rows() as rows:
row = next(rows, None)
if row and row.ref.book != book_id:
if row.ref.book == "":
raise ValueError(f"The \\id tag in {sfm_entry.filename} is invalid.")
raise ValueError(
f"The \\id tag {row.ref.book} in {sfm_entry.filename}"
f" does not match filename book id {book_id}."
)
texts.append(text)

super().__init__(versification, texts)
29 changes: 18 additions & 11 deletions machine/corpora/paratext_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,24 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_
for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
book_id = settings.get_book_id(sfm_filename.name)
if book_id:
texts.append(
UsfmFileText(
settings.stylesheet,
settings.encoding,
book_id,
sfm_filename,
versification,
include_markers,
include_all_text,
settings.name,
)
text = UsfmFileText(
settings.stylesheet,
settings.encoding,
book_id,
sfm_filename,
versification,
include_markers,
include_all_text,
settings.name,
)
with text.get_rows() as rows:
row = next(rows, None)
if row and row.ref.book != book_id:
if row.ref.book == "":
raise ValueError(f"The \\id tag in {sfm_filename} is invalid.")
raise ValueError(
f"The \\id tag {row.ref.book} in {sfm_filename} does not match filename book id {book_id}."
)
texts.append(text)

super().__init__(versification, texts)
30 changes: 26 additions & 4 deletions tests/corpora/test_paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@

from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, ContextManager
from typing import Any, ContextManager, Optional

from testutils.corpora_test_helpers import create_test_paratext_backup
from pytest import raises
from testutils.corpora_test_helpers import (
create_test_paratext_backup,
create_test_paratext_backup_invalid_id,
create_test_paratext_backup_mismatch_id,
)

from machine.corpora import ParatextBackupTextCorpus

Expand All @@ -28,10 +33,27 @@ def test_get_text() -> None:
assert not any(jhn.get_rows())


def test_invalid_id() -> None:
    """Check that the "invalid_id" backup is rejected with the invalid-id ValueError.

    The fixture's 07JDG.SFM starts with an id tag of "JGS".
    """
    # The final period is escaped so that it only matches a literal "." at the end of the
    # message; pytest applies `match` as a regular-expression search, where a bare "."
    # would match any character.
    with raises(ValueError, match=r"The \\id tag in .* is invalid\."):
        # _TestEnvironment constructs the ParatextBackupTextCorpus in its __init__, so the
        # error is expected to surface while the environment is being built; get_text is
        # only reached (and the test then fails) if no error was raised.
        with _TestEnvironment("invalid_id") as env:
            env.corpus.get_text("JDG")


def test_mismatch_id() -> None:
    """Check that the "mismatch_id" backup is rejected with the mismatch ValueError.

    The fixture's 07JDG.SFM starts with an id tag of "JUD".
    """
    expected_message = r"The \\id tag .* in .* does not match filename book id .*"
    # A single `with` carrying both managers behaves like the two nested `with` statements.
    with raises(ValueError, match=expected_message), _TestEnvironment("mismatch_id") as env:
        env.corpus.get_text("JDG")


class _TestEnvironment(ContextManager["_TestEnvironment"]):
def __init__(self) -> None:
def __init__(self, project_folder_name: Optional[str] = None) -> None:
self._temp_dir = TemporaryDirectory()
archive_filename = create_test_paratext_backup(Path(self._temp_dir.name))
if project_folder_name == "invalid_id":
archive_filename = create_test_paratext_backup_invalid_id(Path(self._temp_dir.name))
elif project_folder_name == "mismatch_id":
archive_filename = create_test_paratext_backup_mismatch_id(Path(self._temp_dir.name))
else:
archive_filename = create_test_paratext_backup(Path(self._temp_dir.name))
self._corpus = ParatextBackupTextCorpus(archive_filename)

@property
Expand Down
14 changes: 14 additions & 0 deletions tests/corpora/test_paratext_text_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from pytest import raises
from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_MISMATCH_ID_PROJECT_PATH

from machine.corpora import ParatextTextCorpus


def test_paratext_text_corpus_invalid_id() -> None:
    """Check that constructing a corpus over the invalid-id project raises ValueError.

    The project's 07JDG.SFM starts with an id tag of "JGS".
    """
    # The final period is escaped so that it only matches a literal "." at the end of the
    # message; pytest applies `match` as a regular-expression search, where a bare "."
    # would match any character.
    with raises(ValueError, match=r"The \\id tag in .* is invalid\."):
        ParatextTextCorpus(USFM_INVALID_ID_PROJECT_PATH, include_all_text=True)


def test_paratext_text_corpus_mismatch_id() -> None:
    """Check that constructing a corpus over the mismatch-id project raises ValueError.

    The project's 07JDG.SFM starts with an id tag of "JUD".
    """
    expected_message = r"The \\id tag .* in .* does not match filename book id .*"
    with raises(ValueError, match=expected_message):
        ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True)
12 changes: 12 additions & 0 deletions tests/testutils/corpora_test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes"
USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target"
USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source"
USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id"
USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id"
USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes"
TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt"
CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs"
Expand All @@ -24,6 +26,16 @@ def create_test_paratext_backup(temp_dir: Path) -> Path:
return temp_dir / "Tes.zip"


def create_test_paratext_backup_invalid_id(temp_dir: Path) -> Path:
    """Zip the invalid-id USFM test project into ``temp_dir``.

    Args:
        temp_dir: Directory in which the archive is created.

    Returns:
        The path ``temp_dir / "invalid_id.zip"``.
    """
    archive_stem = "invalid_id"
    # make_archive appends the ".zip" extension to the base name itself.
    shutil.make_archive(
        base_name=str(temp_dir / archive_stem),
        format="zip",
        root_dir=USFM_INVALID_ID_PROJECT_PATH,
    )
    return temp_dir / f"{archive_stem}.zip"


def create_test_paratext_backup_mismatch_id(temp_dir: Path) -> Path:
    """Zip the mismatch-id USFM test project into ``temp_dir``.

    Args:
        temp_dir: Directory in which the archive is created.

    Returns:
        The path ``temp_dir / "mismatch_id.zip"``.
    """
    archive_stem = "mismatch_id"
    # make_archive appends the ".zip" extension to the base name itself.
    shutil.make_archive(
        base_name=str(temp_dir / archive_stem),
        format="zip",
        root_dir=USFM_MISMATCH_ID_PROJECT_PATH,
    )
    return temp_dir / f"{archive_stem}.zip"


def verse_ref(segment: TextRow) -> VerseRef:
    """Return ``segment.ref``, asserting that it is a ``VerseRef``."""
    ref = segment.ref
    assert isinstance(ref, VerseRef)
    return ref
Expand Down
5 changes: 5 additions & 0 deletions tests/testutils/data/usfm/invalid_id/07JDG.SFM
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
\id JGS - Test
\h Judges
\mt Judges
\c 1
\v 1 Chapter one, verse one.
34 changes: 34 additions & 0 deletions tests/testutils/data/usfm/invalid_id/Settings.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<ScriptureText>
<StyleSheet>usfm.sty</StyleSheet>
<Versification>4</Versification>
<LanguageIsoCode>en:::</LanguageIsoCode>
<Language>English</Language>
<MinParatextVersion>8.0.100.76</MinParatextVersion>
<FullName>Test</FullName>
<Encoding>65001</Encoding>
<Editable>T</Editable>
<Copyright />
<NormalizationForm>NFC</NormalizationForm>
<Name>invalid_id</Name>
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
<DefaultFont>Charis SIL</DefaultFont>
<DefaultFontSize>12</DefaultFontSize>
<FontFeatures />
<HtmlLanguage />
<AssociatedLexicalProject />
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
<FileNamePrePart />
<FileNamePostPart>.SFM</FileNamePostPart>
<BiblicalTermsListSetting>Major::BiblicalTerms.xml</BiblicalTermsListSetting>
<MatchBasedOnStems>F</MatchBasedOnStems>
<AllowReadAccess>F</AllowReadAccess>
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
<Visibility>Public</Visibility>
<TranslationInfo>Standard::</TranslationInfo>
<EncodingConverter />
<UsfmVersion>3</UsfmVersion>
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
<BibleModuleAssociations />
<Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />
</ScriptureText>
31 changes: 31 additions & 0 deletions tests/testutils/data/usfm/invalid_id/custom.vrs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# custom.vrs

LEV 14:56
ROM 14:26
REV 12:17
TOB 5:22
TOB 10:12
SIR 23:28
ESG 1:22
ESG 3:15
ESG 5:14
ESG 8:17
ESG 10:14
SIR 33:33
SIR 41:24
BAR 1:22
4MA 7:25
4MA 12:20

# deliberately missing verses
-ROM 16:26
-ROM 16:27
-3JN 1:15
-S3Y 1:49
-ESG 4:6
-ESG 9:5
-ESG 9:30

LEV 14:55 = LEV 14:55
LEV 14:55 = LEV 14:56
LEV 14:56 = LEV 14:57
5 changes: 5 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/07JDG.SFM
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
\id JUD - Test
\h Judges
\mt Judges
\c 1
\v 1 Chapter one, verse one.
34 changes: 34 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/Settings.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<ScriptureText>
<StyleSheet>usfm.sty</StyleSheet>
<Versification>4</Versification>
<LanguageIsoCode>en:::</LanguageIsoCode>
<Language>English</Language>
<MinParatextVersion>8.0.100.76</MinParatextVersion>
<FullName>Test</FullName>
<Encoding>65001</Encoding>
<Editable>T</Editable>
<Copyright />
<NormalizationForm>NFC</NormalizationForm>
<Name>mismatch_id</Name>
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
<DefaultFont>Charis SIL</DefaultFont>
<DefaultFontSize>12</DefaultFontSize>
<FontFeatures />
<HtmlLanguage />
<AssociatedLexicalProject />
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
<FileNamePrePart />
<FileNamePostPart>.SFM</FileNamePostPart>
<BiblicalTermsListSetting>Major::BiblicalTerms.xml</BiblicalTermsListSetting>
<MatchBasedOnStems>F</MatchBasedOnStems>
<AllowReadAccess>F</AllowReadAccess>
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
<Visibility>Public</Visibility>
<TranslationInfo>Standard::</TranslationInfo>
<EncodingConverter />
<UsfmVersion>3</UsfmVersion>
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
<BibleModuleAssociations />
<Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />
</ScriptureText>
31 changes: 31 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/custom.vrs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# custom.vrs

LEV 14:56
ROM 14:26
REV 12:17
TOB 5:22
TOB 10:12
SIR 23:28
ESG 1:22
ESG 3:15
ESG 5:14
ESG 8:17
ESG 10:14
SIR 33:33
SIR 41:24
BAR 1:22
4MA 7:25
4MA 12:20

# deliberately missing verses
-ROM 16:26
-ROM 16:27
-3JN 1:15
-S3Y 1:49
-ESG 4:6
-ESG 9:5
-ESG 9:30

LEV 14:55 = LEV 14:55
LEV 14:55 = LEV 14:56
LEV 14:56 = LEV 14:57

0 comments on commit 8679b78

Please sign in to comment.