Skip to content

Commit

Permalink
Extract book id from filename (#193)
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit authored Apr 24, 2024
1 parent 52b732c commit 448bd1e
Show file tree
Hide file tree
Showing 10 changed files with 240 additions and 94 deletions.
34 changes: 16 additions & 18 deletions src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using System.IO.Compression;
using System.Linq;
using System.Text.RegularExpressions;

namespace SIL.Machine.Corpora
{
Expand All @@ -15,23 +13,23 @@ public ParatextBackupTextCorpus(string fileName, bool includeMarkers = false, bo

Versification = settings.Versification;

var regex = new Regex(
$"^{Regex.Escape(settings.FileNamePrefix)}.*{Regex.Escape(settings.FileNameSuffix)}$"
);

foreach (ZipArchiveEntry sfmEntry in archive.Entries.Where(e => regex.IsMatch(e.FullName)))
foreach (ZipArchiveEntry sfmEntry in archive.Entries)
{
AddText(
new UsfmZipText(
settings.Stylesheet,
settings.Encoding,
fileName,
sfmEntry.FullName,
Versification,
includeMarkers,
includeAllText
)
);
if (settings.IsBookFileName(sfmEntry.FullName, out string bookId))
{
AddText(
new UsfmZipText(
settings.Stylesheet,
settings.Encoding,
bookId,
fileName,
sfmEntry.FullName,
Versification,
includeMarkers,
includeAllText
)
);
}
}
}
}
Expand Down
57 changes: 57 additions & 0 deletions src/SIL.Machine/Corpora/ParatextProjectSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,48 @@ string biblicalTermsFileName
public string BiblicalTermsProjectName { get; }
public string BiblicalTermsFileName { get; }

/// <summary>
/// Determines whether a file name follows this project's book file naming scheme
/// (<see cref="FileNamePrefix"/> + book part + <see cref="FileNameSuffix"/>) and, if so, extracts the book ID.
/// </summary>
/// <param name="fileName">The file name to test.</param>
/// <param name="bookId">Set to the extracted book ID on success; otherwise <c>null</c>.</param>
/// <returns><c>true</c> if the name matches the naming scheme; otherwise <c>false</c>.</returns>
public bool IsBookFileName(string fileName, out string bookId)
{
    bookId = null;

    // Ordinal matching keeps the prefix/suffix test consistent with the length arithmetic below;
    // a culture-sensitive match can succeed on text whose length differs from the prefix/suffix.
    if (
        !fileName.StartsWith(FileNamePrefix, System.StringComparison.Ordinal)
        || !fileName.EndsWith(FileNameSuffix, System.StringComparison.Ordinal)
    )
        return false;

    // If the prefix and suffix overlap inside a short name, nothing is left for the book part.
    // Without this guard Substring would be asked for a negative length and throw.
    int bookPartLength = fileName.Length - FileNamePrefix.Length - FileNameSuffix.Length;
    if (bookPartLength <= 0)
        return false;

    string bookPart = fileName.Substring(FileNamePrefix.Length, bookPartLength);
    if (FileNameForm == "MAT")
    {
        // The book part is the three-character book ID itself.
        if (bookPart.Length != 3)
            return false;

        bookId = bookPart;
    }
    else if (FileNameForm == "40" || FileNameForm == "41")
    {
        // The book part is only the book "digits". Validate it before parsing so that an unrelated
        // file that happens to share the prefix and suffix is rejected instead of raising a
        // FormatException inside GetBookNumber.
        if (!IsBookFileNameDigits(bookPart))
            return false;

        // NOTE(review): what Canon.BookNumberToId returns for a number outside the canon (e.g. "00")
        // is not visible here - confirm whether such names should also be rejected.
        bookId = Canon.BookNumberToId(GetBookNumber(bookPart));
    }
    else
    {
        // Combined form: book digits followed by the three-character book ID. The digits are two
        // characters long, except for the three-character "100".
        int digitCount = bookPart.StartsWith("100", System.StringComparison.Ordinal) ? 3 : 2;
        if (bookPart.Length != digitCount + 3)
            return false;

        bookId = bookPart.Substring(digitCount);
    }
    return true;
}

/// <summary>
/// Checks that a book part has the shape GetBookNumber can parse: the literal "100", or two characters
/// where the first is an ASCII digit or one of 'A', 'B', 'C' and the second is an ASCII digit.
/// </summary>
private static bool IsBookFileNameDigits(string bookPart)
{
    if (bookPart == "100")
        return true;
    if (bookPart.Length != 2)
        return false;

    char first = bookPart[0];
    char second = bookPart[1];
    bool firstIsValid = (first >= '0' && first <= '9') || first == 'A' || first == 'B' || first == 'C';
    return firstIsValid && second >= '0' && second <= '9';
}

public string GetBookFileName(string bookId)
{
string bookPart;
Expand Down Expand Up @@ -73,5 +115,20 @@ private static string GetBookFileNameDigits(string bookId)
return "B" + (bookNum - 110);
return "C" + (bookNum - 120);
}

/// <summary>
/// Converts the "digits" portion of a book file name into a book number.
/// </summary>
/// <remarks>
/// A leading "A", "B" or "C" selects a base of 100, 110 or 120, to which the remaining digits are added.
/// Purely numeric values of 40 or more are shifted down by one; smaller values are returned unchanged.
/// </remarks>
private static int GetBookNumber(string bookFileNameDigits)
{
    // Work out which letter-coded range (if any) the value belongs to.
    int letterBase = 0;
    if (bookFileNameDigits.StartsWith("A"))
        letterBase = 100;
    else if (bookFileNameDigits.StartsWith("B"))
        letterBase = 110;
    else if (bookFileNameDigits.StartsWith("C"))
        letterBase = 120;

    if (letterBase != 0)
    {
        string remainder = bookFileNameDigits.Substring(1);
        return letterBase + int.Parse(remainder, CultureInfo.InvariantCulture);
    }

    // Plain numeric form.
    int parsed = int.Parse(bookFileNameDigits, CultureInfo.InvariantCulture);
    return parsed >= 40 ? parsed - 1 : parsed;
}
}
}
24 changes: 14 additions & 10 deletions src/SIL.Machine/Corpora/ParatextTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,20 @@ string sfmFileName in Directory.EnumerateFiles(
)
)
{
AddText(
new UsfmFileText(
settings.Stylesheet,
settings.Encoding,
sfmFileName,
Versification,
includeMarkers,
includeAllText
)
);
if (settings.IsBookFileName(Path.GetFileName(sfmFileName), out string bookId))
{
AddText(
new UsfmFileText(
settings.Stylesheet,
settings.Encoding,
bookId,
sfmFileName,
Versification,
includeMarkers,
includeAllText
)
);
}
}
}
}
Expand Down
28 changes: 3 additions & 25 deletions src/SIL.Machine/Corpora/UsfmFileText.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using System;
using System.IO;
using System.Text;
using System.Text;
using SIL.Scripture;

namespace SIL.Machine.Corpora
Expand All @@ -12,12 +10,13 @@ public class UsfmFileText : UsfmTextBase
public UsfmFileText(
UsfmStylesheet stylesheet,
Encoding encoding,
string id,
string fileName,
ScrVers versification = null,
bool includeMarkers = false,
bool includeAllText = false
)
: base(GetId(fileName, encoding), stylesheet, encoding, versification, includeMarkers, includeAllText)
: base(id, stylesheet, encoding, versification, includeMarkers, includeAllText)
{
_fileName = fileName;
}
Expand All @@ -26,26 +25,5 @@ protected override IStreamContainer CreateStreamContainer()
{
return new FileStreamContainer(_fileName);
}

/// <summary>
/// Reads the book ID from the first <c>\id</c> marker line of a USFM file.
/// </summary>
/// <param name="fileName">Path of the USFM file to read.</param>
/// <param name="encoding">Encoding used to decode the file.</param>
/// <returns>The first whitespace-delimited token that follows the <c>\id</c> marker.</returns>
/// <exception cref="InvalidOperationException">No line of the file starts with the <c>\id</c> marker.</exception>
private static string GetId(string fileName, Encoding encoding)
{
    using (var reader = new StreamReader(fileName, encoding))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();
            if (!line.StartsWith("\\id ", StringComparison.Ordinal))
                continue;

            // Skip any extra whitespace after the marker so that "\id   MAT" yields "MAT" rather than
            // an empty ID, then stop at the next whitespace character (space or tab).
            string rest = line.Substring(4).TrimStart();
            int end = 0;
            while (end < rest.Length && !char.IsWhiteSpace(rest[end]))
                end++;
            return rest.Substring(0, end);
        }
    }
    throw new InvalidOperationException("The USFM does not contain an 'id' marker.");
}
}
}
39 changes: 36 additions & 3 deletions src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,43 @@ public UsfmFileTextCorpus(
var stylesheet = new UsfmStylesheet(stylesheetFileName);
foreach (string sfmFileName in Directory.EnumerateFiles(projectPath, filePattern))
{
AddText(
new UsfmFileText(stylesheet, encoding, sfmFileName, Versification, includeMarkers, includeAllText)
);
string id = GetId(sfmFileName, encoding);
if (id != null)
{
AddText(
new UsfmFileText(
stylesheet,
encoding,
id,
sfmFileName,
Versification,
includeMarkers,
includeAllText
)
);
}
}
}

/// <summary>
/// Reads the book ID from the first <c>\id</c> marker line of a USFM file.
/// </summary>
/// <param name="fileName">Path of the USFM file to read.</param>
/// <param name="encoding">Encoding used to decode the file.</param>
/// <returns>
/// The first whitespace-delimited token that follows the <c>\id</c> marker, or <c>null</c> if no line of the
/// file starts with that marker.
/// </returns>
private static string GetId(string fileName, Encoding encoding)
{
    using (var reader = new StreamReader(fileName, encoding))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();
            if (!line.StartsWith("\\id ", System.StringComparison.Ordinal))
                continue;

            // Skip any extra whitespace after the marker so that "\id   MAT" yields "MAT" rather than
            // an empty ID, then stop at the next whitespace character (space or tab).
            string rest = line.Substring(4).TrimStart();
            int end = 0;
            while (end < rest.Length && !char.IsWhiteSpace(rest[end]))
                end++;
            return rest.Substring(0, end);
        }
    }
    return null;
}
}
}
40 changes: 3 additions & 37 deletions src/SIL.Machine/Corpora/UsfmZipText.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
using System;
using System.IO;
using System.IO.Compression;
using System.Text;
using System.Text;
using SIL.Scripture;

namespace SIL.Machine.Corpora
Expand All @@ -14,20 +11,14 @@ public class UsfmZipText : UsfmTextBase
public UsfmZipText(
UsfmStylesheet stylesheet,
Encoding encoding,
string id,
string archiveFileName,
string path,
ScrVers versification = null,
bool includeMarkers = false,
bool includeAllText = false
)
: base(
GetId(archiveFileName, path, encoding),
stylesheet,
encoding,
versification,
includeMarkers,
includeAllText
)
: base(id, stylesheet, encoding, versification, includeMarkers, includeAllText)
{
_archiveFileName = archiveFileName;
_path = path;
Expand All @@ -37,30 +28,5 @@ protected override IStreamContainer CreateStreamContainer()
{
return new ZipEntryStreamContainer(_archiveFileName, _path);
}

/// <summary>
/// Reads the book ID from the first <c>\id</c> marker line of a USFM entry inside a zip archive.
/// </summary>
/// <param name="archiveFileName">Path of the zip archive.</param>
/// <param name="path">Path of the USFM entry within the archive.</param>
/// <param name="encoding">Encoding used to decode the entry.</param>
/// <returns>The first whitespace-delimited token that follows the <c>\id</c> marker.</returns>
/// <exception cref="InvalidOperationException">
/// The archive has no entry at <paramref name="path"/>, or no line of the entry starts with the
/// <c>\id</c> marker.
/// </exception>
private static string GetId(string archiveFileName, string path, Encoding encoding)
{
    using (var archive = ZipFile.OpenRead(archiveFileName))
    {
        // GetEntry returns null for a missing entry; report that explicitly instead of letting it
        // surface as a NullReferenceException on entry.Open().
        ZipArchiveEntry entry = archive.GetEntry(path);
        if (entry == null)
            throw new InvalidOperationException("The archive does not contain the entry '" + path + "'.");

        using (var reader = new StreamReader(entry.Open(), encoding))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                line = line.Trim();
                if (!line.StartsWith("\\id ", StringComparison.Ordinal))
                    continue;

                // Skip any extra whitespace after the marker so that "\id   MAT" yields "MAT" rather
                // than an empty ID, then stop at the next whitespace character (space or tab).
                string rest = line.Substring(4).TrimStart();
                int end = 0;
                while (end < rest.Length && !char.IsWhiteSpace(rest[end]))
                    end++;
                return rest.Substring(0, end);
            }
        }
    }
    throw new InvalidOperationException("The USFM does not contain an 'id' marker.");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ public class ParatextBackupTextCorpusTests
public void Texts()
{
using var env = new TestEnvironment();
Assert.That(env.Corpus.Texts.Select(t => t.Id), Is.EquivalentTo(new[] { "LEV", "1CH", "MAT", "MRK" }));
Assert.That(env.Corpus.Texts.Select(t => t.Id), Is.EquivalentTo(new[] { "LEV", "1CH", "MAT", "MRK", "JHN" }));
}

[Test]
Expand All @@ -20,6 +20,8 @@ public void TryGetText()
Assert.That(env.Corpus.TryGetText("MAT", out IText mat), Is.True);
Assert.That(mat.GetRows(), Is.Not.Empty);
Assert.That(env.Corpus.TryGetText("LUK", out _), Is.False);
Assert.That(env.Corpus.TryGetText("JHN", out IText jhn), Is.True);
Assert.That(jhn.GetRows(), Is.Empty);
}

private class TestEnvironment : DisposableBase
Expand Down
Loading

0 comments on commit 448bd1e

Please sign in to comment.