diff --git a/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs b/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs
index 3a88cd276..f644e524d 100644
--- a/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs
+++ b/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs
@@ -1,6 +1,4 @@
 using System.IO.Compression;
-using System.Linq;
-using System.Text.RegularExpressions;
 
 namespace SIL.Machine.Corpora
 {
@@ -15,23 +13,23 @@ public ParatextBackupTextCorpus(string fileName, bool includeMarkers = false, bo
 
                 Versification = settings.Versification;
 
-                var regex = new Regex(
-                    $"^{Regex.Escape(settings.FileNamePrefix)}.*{Regex.Escape(settings.FileNameSuffix)}$"
-                );
-
-                foreach (ZipArchiveEntry sfmEntry in archive.Entries.Where(e => regex.IsMatch(e.FullName)))
+                foreach (ZipArchiveEntry sfmEntry in archive.Entries)
                 {
-                    AddText(
-                        new UsfmZipText(
-                            settings.Stylesheet,
-                            settings.Encoding,
-                            fileName,
-                            sfmEntry.FullName,
-                            Versification,
-                            includeMarkers,
-                            includeAllText
-                        )
-                    );
+                    if (settings.IsBookFileName(sfmEntry.FullName, out string bookId))
+                    {
+                        AddText(
+                            new UsfmZipText(
+                                settings.Stylesheet,
+                                settings.Encoding,
+                                bookId,
+                                fileName,
+                                sfmEntry.FullName,
+                                Versification,
+                                includeMarkers,
+                                includeAllText
+                            )
+                        );
+                    }
                 }
             }
         }
diff --git a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs
index 6837c33ed..4d5e49bf8 100644
--- a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs
+++ b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs
@@ -45,6 +45,71 @@ string biblicalTermsFileName
         public string BiblicalTermsProjectName { get; }
         public string BiblicalTermsFileName { get; }
 
+        /// <summary>
+        /// Checks whether a file name matches this project's book file naming scheme, i.e.
+        /// <see cref="FileNamePrefix"/>, then a book part laid out according to <see cref="FileNameForm"/>, then
+        /// <see cref="FileNameSuffix"/>, and extracts the book id from it.
+        /// </summary>
+        /// <param name="fileName">The file name to test.</param>
+        /// <param name="bookId">The book id taken from the file name, or <c>null</c> if it does not match.</param>
+        /// <returns><c>true</c> if the file name matches the naming scheme; otherwise, <c>false</c>.</returns>
+        public bool IsBookFileName(string fileName, out string bookId)
+        {
+            bookId = null;
+
+            // The length check rejects names in which the prefix and suffix overlap (prefix "AB", suffix "BC",
+            // name "ABC"); without it the Substring call below would get a negative length and throw.
+            if (
+                fileName.Length < FileNamePrefix.Length + FileNameSuffix.Length
+                || !fileName.StartsWith(FileNamePrefix)
+                || !fileName.EndsWith(FileNameSuffix)
+            )
+            {
+                return false;
+            }
+
+            string bookPart = fileName.Substring(
+                FileNamePrefix.Length,
+                fileName.Length - FileNamePrefix.Length - FileNameSuffix.Length
+            );
+            if (FileNameForm == "MAT")
+            {
+                // Book id only, e.g. "MRK".
+                if (bookPart.Length != 3)
+                    return false;
+
+                bookId = bookPart;
+            }
+            else if (FileNameForm == "40" || FileNameForm == "41")
+            {
+                // Book number only, e.g. "42", "A0" or "100".
+                if (bookPart != "100" && bookPart.Length != 2)
+                    return false;
+
+                // A book part of the right length that is not a number (e.g. "XY") is simply not a book file.
+                if (!TryGetBookNumber(bookPart, out int bookNum))
+                    return false;
+
+                bookId = Canon.BookNumberToId(bookNum);
+            }
+            else
+            {
+                // Book number followed by book id, e.g. "42MRK" or "100XXG".
+                if (bookPart.StartsWith("100"))
+                {
+                    if (bookPart.Length != 6)
+                        return false;
+                }
+                else if (bookPart.Length != 5)
+                {
+                    return false;
+                }
+
+                bookId = bookPart.Length == 5 ? bookPart.Substring(2) : bookPart.Substring(3);
+            }
+            return true;
+        }
+
         public string GetBookFileName(string bookId)
         {
             string bookPart;
@@ -73,5 +138,38 @@ private static string GetBookFileNameDigits(string bookId)
                 return "B" + (bookNum - 110);
             return "C" + (bookNum - 120);
         }
+
+        /// <summary>
+        /// Converts the book number part of a file name (e.g. "42", "100" or "A0") to the book number that
+        /// <c>Canon.BookNumberToId</c> is called with.
+        /// </summary>
+        /// <param name="bookFileNameDigits">The book number part of the file name.</param>
+        /// <param name="bookNum">The book number, or 0 if the part could not be parsed.</param>
+        /// <returns><c>true</c> if the part was parsed; otherwise, <c>false</c>.</returns>
+        private static bool TryGetBookNumber(string bookFileNameDigits, out int bookNum)
+        {
+            bookNum = 0;
+
+            // A leading "A", "B" or "C" stands for an offset of 100, 110 or 120 that is added to the remaining digits.
+            int offset = 0;
+            if (bookFileNameDigits.StartsWith("A"))
+                offset = 100;
+            else if (bookFileNameDigits.StartsWith("B"))
+                offset = 110;
+            else if (bookFileNameDigits.StartsWith("C"))
+                offset = 120;
+            string digits = offset == 0 ? bookFileNameDigits : bookFileNameDigits.Substring(1);
+
+            // NumberStyles.None accepts digits only, so letters, signs and whitespace all fail to parse.
+            if (!int.TryParse(digits, NumberStyles.None, CultureInfo.InvariantCulture, out int value))
+                return false;
+
+            // Unprefixed file numbers of 40 and above are one higher than the book number.
+            if (offset != 0)
+                bookNum = offset + value;
+            else
+                bookNum = value >= 40 ? value - 1 : value;
+            return true;
+        }
     }
 }
diff --git a/src/SIL.Machine/Corpora/ParatextTextCorpus.cs b/src/SIL.Machine/Corpora/ParatextTextCorpus.cs
index 2b9357e9d..b1fa50158 100644
--- a/src/SIL.Machine/Corpora/ParatextTextCorpus.cs
+++ b/src/SIL.Machine/Corpora/ParatextTextCorpus.cs
@@ -18,16 +18,20 @@ string sfmFileName in Directory.EnumerateFiles(
                 )
             )
             {
-                AddText(
-                    new UsfmFileText(
-                        settings.Stylesheet,
-                        settings.Encoding,
-                        sfmFileName,
-                        Versification,
-                        includeMarkers,
-                        includeAllText
-                    )
-                );
+                if (settings.IsBookFileName(Path.GetFileName(sfmFileName), out string bookId))
+                {
+                    AddText(
+                        new UsfmFileText(
+                            settings.Stylesheet,
+                            settings.Encoding,
+                            bookId,
+                            sfmFileName,
+                            Versification,
+                            includeMarkers,
+                            includeAllText
+                        )
+                    );
+                }
             }
         }
     }
diff --git a/src/SIL.Machine/Corpora/UsfmFileText.cs b/src/SIL.Machine/Corpora/UsfmFileText.cs
index 6f1c34f82..ee3dde07a 100644
--- a/src/SIL.Machine/Corpora/UsfmFileText.cs
+++ b/src/SIL.Machine/Corpora/UsfmFileText.cs
@@ -1,6 +1,4 @@
-using System;
-using System.IO;
-using System.Text;
+using System.Text;
 using SIL.Scripture;
 
 namespace SIL.Machine.Corpora
@@ -12,12 +10,13 @@ public class UsfmFileText : UsfmTextBase
         public UsfmFileText(
             UsfmStylesheet stylesheet,
             Encoding encoding,
+            string id,
             string fileName,
             ScrVers versification = null,
             bool includeMarkers = false,
             bool includeAllText = false
         )
-            : base(GetId(fileName, encoding), stylesheet, encoding, versification, includeMarkers, includeAllText)
+            : base(id, stylesheet, encoding, versification, includeMarkers, includeAllText)
         {
             _fileName = fileName;
         }
@@ -26,26 +25,5 @@ protected override IStreamContainer CreateStreamContainer()
         {
             return new FileStreamContainer(_fileName);
         }
-
-        private static string GetId(string fileName, Encoding encoding)
-        {
-            using (var reader = new StreamReader(fileName, encoding))
-            {
-                string line;
-                while ((line = reader.ReadLine()) != null)
-                {
-                    line = line.Trim();
-                    if (line.StartsWith("\\id "))
-                    {
-                        string id = line.Substring(4);
-                        int index = id.IndexOf(" ");
-                        if (index != -1)
-                            id = id.Substring(0, index);
-                        return id.Trim();
-                    }
-                }
-            }
-            throw new InvalidOperationException("The USFM does not contain an 'id' marker.");
-        }
     }
 }
diff --git a/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs b/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs
index e3e452ed7..abb336dfa 100644
--- a/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs
+++ b/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs
@@ -20,10 +20,48 @@ public UsfmFileTextCorpus(
             var stylesheet = new UsfmStylesheet(stylesheetFileName);
             foreach (string sfmFileName in Directory.EnumerateFiles(projectPath, filePattern))
             {
-                AddText(
-                    new UsfmFileText(stylesheet, encoding, sfmFileName, Versification, includeMarkers, includeAllText)
-                );
+                string id = GetId(sfmFileName, encoding);
+                if (id != null)
+                {
+                    AddText(
+                        new UsfmFileText(
+                            stylesheet,
+                            encoding,
+                            id,
+                            sfmFileName,
+                            Versification,
+                            includeMarkers,
+                            includeAllText
+                        )
+                    );
+                }
             }
         }
+
+        /// <summary>
+        /// Reads the book id from the first line of a USFM file that starts with the <c>\id</c> marker.
+        /// </summary>
+        /// <returns>The book id, or <c>null</c> if the file has no such line.</returns>
+        private static string GetId(string fileName, Encoding encoding)
+        {
+            using (var reader = new StreamReader(fileName, encoding))
+            {
+                string line;
+                while ((line = reader.ReadLine()) != null)
+                {
+                    line = line.Trim();
+                    if (line.StartsWith("\\id "))
+                    {
+                        // Skip extra whitespace after the marker; otherwise "\id  GEN" would yield an empty id.
+                        string id = line.Substring(4).TrimStart();
+                        int index = id.IndexOf(" ");
+                        if (index != -1)
+                            id = id.Substring(0, index);
+                        return id.Trim();
+                    }
+                }
+            }
+            return null;
+        }
     }
 }
diff --git a/src/SIL.Machine/Corpora/UsfmZipText.cs b/src/SIL.Machine/Corpora/UsfmZipText.cs
index c1ce48641..c7f4793e7 100644
--- a/src/SIL.Machine/Corpora/UsfmZipText.cs
+++ b/src/SIL.Machine/Corpora/UsfmZipText.cs
@@ -1,7 +1,4 @@
-using System;
-using System.IO;
-using System.IO.Compression;
-using System.Text;
+using System.Text;
 using SIL.Scripture;
 
 namespace SIL.Machine.Corpora
@@ -14,20 +11,14 @@ public class UsfmZipText : UsfmTextBase
         public UsfmZipText(
             UsfmStylesheet stylesheet,
             Encoding encoding,
+            string id,
             string archiveFileName,
             string path,
             ScrVers versification = null,
             bool includeMarkers = false,
             bool includeAllText = false
         )
-            : base(
-                GetId(archiveFileName, path, encoding),
-                stylesheet,
-                encoding,
-                versification,
-                includeMarkers,
-                includeAllText
-            )
+            : base(id, stylesheet, encoding, versification, includeMarkers, includeAllText)
         {
             _archiveFileName = archiveFileName;
             _path = path;
@@ -37,30 +28,5 @@ protected override IStreamContainer CreateStreamContainer()
         {
             return new ZipEntryStreamContainer(_archiveFileName, _path);
         }
-
-        private static string GetId(string archiveFileName, string path, Encoding encoding)
-        {
-            using (var archive = ZipFile.OpenRead(archiveFileName))
-            {
-                ZipArchiveEntry entry = archive.GetEntry(path);
-                using (var reader = new StreamReader(entry.Open(), encoding))
-                {
-                    string line;
-                    while ((line = reader.ReadLine()) != null)
-                    {
-                        line = line.Trim();
-                        if (line.StartsWith("\\id "))
-                        {
-                            string id = line.Substring(4);
-                            int index = id.IndexOf(" ");
-                            if (index != -1)
-                                id = id.Substring(0, index);
-                            return id.Trim();
-                        }
-                    }
-                }
-            }
-            throw new InvalidOperationException("The USFM does not contain an 'id' marker.");
-        }
     }
 }
diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextBackupTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextBackupTextCorpusTests.cs
index e68f9ce97..036a0ed7e 100644
--- a/tests/SIL.Machine.Tests/Corpora/ParatextBackupTextCorpusTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/ParatextBackupTextCorpusTests.cs
@@ -10,7 +10,7 @@ public class ParatextBackupTextCorpusTests
         public void Texts()
         {
             using var env = new TestEnvironment();
-            Assert.That(env.Corpus.Texts.Select(t => t.Id), Is.EquivalentTo(new[] { "LEV", "1CH", "MAT", "MRK" }));
+            Assert.That(env.Corpus.Texts.Select(t => t.Id), Is.EquivalentTo(new[] { "LEV", "1CH", "MAT", "MRK", "JHN" }));
         }
 
         [Test]
@@ -20,6 +20,8 @@ public void TryGetText()
             Assert.That(env.Corpus.TryGetText("MAT", out IText mat), Is.True);
             Assert.That(mat.GetRows(), Is.Not.Empty);
             Assert.That(env.Corpus.TryGetText("LUK", out _), Is.False);
+            Assert.That(env.Corpus.TryGetText("JHN", out IText jhn), Is.True);
+            Assert.That(jhn.GetRows(), Is.Empty);
         }
 
         private class TestEnvironment : DisposableBase
diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectSettingsTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectSettingsTests.cs
index e28e12b8b..07bdf7927 100644
--- a/tests/SIL.Machine.Tests/Corpora/ParatextProjectSettingsTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/ParatextProjectSettingsTests.cs
@@ -63,6 +63,122 @@ public void GetBookFileName_BookNumPrefixC()
             Assert.That(settings.GetBookFileName("3MQ"), Is.EqualTo("PROJC0.SFM"));
         }
 
+        [Test]
+        public void IsBookFileName_BookNum()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJ42.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("MRK"));
+        }
+
+        [Test]
+        public void IsBookFileName_BookNumBookId()
+        {
+            ParatextProjectSettings settings = CreateSettings("41MAT");
+            Assert.That(settings.IsBookFileName("PROJ42MRK.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("MRK"));
+        }
+
+        [Test]
+        public void IsBookFileName_BookId()
+        {
+            ParatextProjectSettings settings = CreateSettings("MAT");
+            Assert.That(settings.IsBookFileName("PROJMRK.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("MRK"));
+        }
+
+        [Test]
+        public void IsBookFileName_BookNumDoubleDigit()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJ01.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("GEN"));
+        }
+
+        [Test]
+        public void IsBookFileName_BookNumXXG_BookNum()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJ100.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("XXG"));
+        }
+
+        [Test]
+        public void IsBookFileName_BookNumXXG_BookNumBookId()
+        {
+            ParatextProjectSettings settings = CreateSettings("41MAT");
+            Assert.That(settings.IsBookFileName("PROJ100XXG.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("XXG"));
+        }
+
+        [Test]
+        public void IsBookFileName_BookNumPrefixA()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJA0.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("FRT"));
+        }
+
+        [Test]
+        public void IsBookFileName_BookNumPrefixB()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJB0.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("TDX"));
+        }
+
+        [Test]
+        public void IsBookFileName_BookNumPrefixC()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJC0.SFM", out string bookId), Is.True);
+            Assert.That(bookId, Is.EqualTo("3MQ"));
+        }
+
+        [Test]
+        public void IsBookFileName_WrongPrefix()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("WRONG42.SFM", out _), Is.False);
+        }
+
+        [Test]
+        public void IsBookFileName_WrongSuffix()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJ42.TXT", out _), Is.False);
+        }
+
+        [Test]
+        public void IsBookFileName_WrongBookPart_BookNum()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJ42MRK.SFM", out _), Is.False);
+        }
+
+        [Test]
+        public void IsBookFileName_WrongBookPart_BookId()
+        {
+            ParatextProjectSettings settings = CreateSettings("MAT");
+            Assert.That(settings.IsBookFileName("PROJ42.SFM", out _), Is.False);
+        }
+
+        [Test]
+        public void IsBookFileName_WrongBookPart_BookNumBookId()
+        {
+            ParatextProjectSettings settings = CreateSettings("41MAT");
+            Assert.That(settings.IsBookFileName("PROJMRK.SFM", out _), Is.False);
+            Assert.That(settings.IsBookFileName("PROJ100.SFM", out _), Is.False);
+        }
+
+        [Test]
+        public void IsBookFileName_NonNumericBookPart_BookNum()
+        {
+            ParatextProjectSettings settings = CreateSettings("41");
+            Assert.That(settings.IsBookFileName("PROJXY.SFM", out _), Is.False);
+            Assert.That(settings.IsBookFileName("PROJAX.SFM", out _), Is.False);
+        }
+
         private static ParatextProjectSettings CreateSettings(string fileNameForm)
         {
             return new ParatextProjectSettings(
diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/04LEVTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM
similarity index 100%
rename from tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/04LEVTes.SFM
rename to tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM
diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/44JHNTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/44JHNTes.SFM
new file mode 100644
index 000000000..e69de29bb