Skip to content

Commit

Permalink
Fix usfm population (#195)
Browse files Browse the repository at this point in the history
Add test framework for testing usfm generation
Add test case
Fix double \va \vp tags for replacement.

Reverted tokenization change.
  • Loading branch information
johnml1135 authored Apr 25, 2024
1 parent 9ff51e6 commit dd7ac6d
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 63 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,5 @@ src/sentencepiece4cbuild/
tests/CoverageResults/*
tests/SIL.Machine.Tests/Corpora/TestData/usfm/source/*
tests/SIL.Machine.Tests/Corpora/TestData/usfm/target/*

tests/SIL.Machine.Tests/Corpora/TestData/project/*
tests/SIL.Machine.Tests/Corpora/TestData/pretranslations.json
48 changes: 26 additions & 22 deletions src/SIL.Machine/Corpora/UsfmTextUpdater.cs
Original file line number Diff line number Diff line change
Expand Up @@ -304,38 +304,42 @@ public string GetUsfm(UsfmStylesheet stylesheet)
/// <summary>
/// Collects the text of rows in <c>_rows</c> whose scripture references match the given
/// segment references. Scanning resumes at <c>_rowIndex</c> rather than restarting from the
/// first row, and references are compared with <c>compareSegments: false</c> and the
/// configured <c>_strictComparison</c> flag.
/// </summary>
/// <param name="segScrRefs">The references of the segment currently being processed.</param>
/// <returns>The texts of the matching rows, in the order they were found (possibly empty).</returns>
private IReadOnlyList<string> AdvanceRows(IReadOnlyList<ScriptureRef> segScrRefs)
{
var rowTexts = new List<string>();
int i = 0;
while (_rowIndex < _rows.Count && i < segScrRefs.Count)
int sourceIndex = 0;
// search the sorted rows with updated text, starting from where we left off last.
while (_rowIndex < _rows.Count && sourceIndex < segScrRefs.Count)
{
// get the set of references for the current row
int compare = 0;
(IReadOnlyList<ScriptureRef> rowScrRefs, string text) = _rows[_rowIndex];
bool stop = false;
foreach (ScriptureRef rowScrRef in rowScrRefs)
{
bool found = false;
for (; i < segScrRefs.Count; i++)
while (sourceIndex < segScrRefs.Count)
{
int compare = rowScrRef.CompareTo(segScrRefs[i], compareSegments: false, _strictComparison);
if (compare == 0)
{
rowTexts.Add(text);
i++;
found = true;
compare = rowScrRef.CompareTo(
segScrRefs[sourceIndex],
compareSegments: false,
_strictComparison
);
if (compare > 0)
// compare > 0: the row ref sorts after this source ref (assuming the usual
// CompareTo sign convention -- TODO confirm), so advance the source index
sourceIndex++;
else
break;
}
else if (compare > 0)
{
stop = true;
break;
}
}
if (stop || found)
if (compare == 0)
{
// source and row match
// grab the text and advance the source index; the row index is advanced
// by the compare <= 0 check after the foreach
rowTexts.Add(text);
sourceIndex++;
break;
}
}

if (stop)
break;
else
if (compare <= 0)
{
// compare <= 0: this row either matched or does not sort after the current
// source ref, so move on to the next row
_rowIndex++;
}
}
return rowTexts;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
\mt1 Major Title 1
\d \va (1)\va* Description
\q1
\v 1 \va 2\va* \vp 1 (2)\vp* Chapter 3 verse 1.
\v 1 \va 2\va*\vp 1 (2)\vp* Chapter 3 verse 1.
\q1 3.1 part 2
\b
\q1 3.1 part 3
Expand Down
99 changes: 99 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
using System.Text.Json;
using NUnit.Framework;

namespace SIL.Machine.Corpora;

/// <summary>
/// Manually-run tests for USFM parsing and generation. Every test is marked
/// <c>[Ignore]</c>; remove the attribute to run one locally against the data found under
/// the <see cref="CorporaTestHelpers"/> paths.
/// </summary>
[TestFixture]
public class UsfmManualTests
{
    /// <summary>
    /// Builds a parallel corpus from the source and target USFM projects (all text and
    /// markers included) and checks that at least one row is produced.
    /// </summary>
    [Test]
    [Ignore("This is for manual testing only. Remove this tag to run the test.")]
    public void ParseParallelCorpus()
    {
        var tCorpus = new ParatextTextCorpus(
            projectDir: CorporaTestHelpers.UsfmTargetProjectPath,
            includeAllText: true,
            includeMarkers: true
        );

        var sCorpus = new ParatextTextCorpus(
            projectDir: CorporaTestHelpers.UsfmSourceProjectPath,
            includeAllText: true,
            includeMarkers: true
        );

        ParallelTextCorpus pCorpus = new ParallelTextCorpus(
            sCorpus,
            tCorpus,
            alignmentCorpus: null,
            rowRefComparer: null
        )
        {
            AllSourceRows = true,
            AllTargetRows = false
        };

        var rows = pCorpus.GetRows().ToList();
        Assert.That(rows, Is.Not.Empty);
    }

    /// <summary>
    /// Shape of one entry in the pretranslations JSON file (property names are read as
    /// camelCase by <see cref="CreateUsfmFile"/>).
    /// </summary>
    public record PretranslationDto
    {
        public required string TextId { get; init; }
        public required IReadOnlyList<string> Refs { get; init; }
        public required string Translation { get; init; }
    }

    /// <summary>Path of the pretranslations JSON file inside the test data directory.</summary>
    public static readonly string PretranslationPath = Path.Combine(
        CorporaTestHelpers.TestDataPath,
        "pretranslations.json"
    );

    /// <summary>Path of the Paratext project directory inside the test data directory.</summary>
    public static readonly string ParatextProjectPath = Path.Combine(CorporaTestHelpers.TestDataPath, "project");

    /// <summary>
    /// Reads the pretranslations file, converts each entry into a (references, text) row,
    /// and runs <see cref="UsfmTextUpdater"/> over every USFM file of the project that
    /// matches the project's file name prefix/suffix, checking that USFM is produced.
    /// </summary>
    [Test]
    [Ignore("This is for manual testing only. Remove this tag to run the test.")]
    public async Task CreateUsfmFile()
    {
        var parser = new FileParatextProjectSettingsParser(ParatextProjectPath);
        ParatextProjectSettings settings = parser.Parse();

        // Read text from pretranslations file. The stream is disposed when the test
        // finishes so the file handle is not leaked; it must stay open until the async
        // enumerable below has been fully consumed by ToListAsync.
        await using Stream pretranslationStream = File.OpenRead(PretranslationPath);
        IAsyncEnumerable<PretranslationDto?>? pretranslations =
            JsonSerializer.DeserializeAsyncEnumerable<PretranslationDto>(
                pretranslationStream,
                new JsonSerializerOptions { PropertyNamingPolicy = JsonNamingPolicy.CamelCase }
            );

        // Drop null entries, parse the references with the project's versification, and
        // order the rows by their first reference before handing them to the updater.
        var pretranslationsList = (await pretranslations.ToListAsync())
            .Where(p => p is not null)
            .Select(p =>
                (
                    (IReadOnlyList<ScriptureRef>)
                        p!.Refs.Select(r => ScriptureRef.Parse(r, settings.Versification)).ToList(),
                    p.Translation
                )
            )
            .OrderBy(p => p.Item1[0])
            .ToList();

        foreach (
            string sfmFileName in Directory.EnumerateFiles(
                ParatextProjectPath,
                $"{settings.FileNamePrefix}*{settings.FileNameSuffix}"
            )
        )
        {
            // A fresh updater is created for each file.
            var updater = new UsfmTextUpdater(
                pretranslationsList,
                stripAllText: true,
                strictComparison: false,
                preferExistingText: true
            );
            var usfm = await File.ReadAllTextAsync(sfmFileName);
            UsfmParser.Parse(usfm, updater, settings.Stylesheet, settings.Versification);
            var newUsfm = updater.GetUsfm(settings.Stylesheet);
            Assert.That(newUsfm, Is.Not.Null);
        }
    }
}
38 changes: 0 additions & 38 deletions tests/SIL.Machine.Tests/Corpora/UsfmParserTests.cs

This file was deleted.

33 changes: 33 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,39 @@ public void GetUsfm_NonVerse_ReplaceNote()
);
}

// Replacing the text of a verse that carries both \va and \vp markers must keep each
// marker exactly once and place the new text directly after them.
[Test]
public void GetUsfm_Verse_DoubleVaVp()
{
    const string expectedIdLine = "\\id MAT - Test\r\n";
    const string expectedVerseLine =
        "\\v 1 \\va 2\\va*\\vp 1 (2)\\vp*Updating later in the book to start.\r\n";

    // A single replacement row targeting the verse with the \va/\vp markers.
    var rows = new List<(IReadOnlyList<ScriptureRef>, string)>();
    rows.Add((ScrRef("MAT 3:1"), "Updating later in the book to start."));

    string target = UpdateUsfm(rows);

    Assert.That(target, Contains.Substring(expectedIdLine));
    Assert.That(target, Contains.Substring(expectedVerseLine));
}

// Rows for an earlier book (GEN) precede the row for the MAT introductory paragraph;
// the MAT row must still be applied to the generated USFM.
[Test]
public void GetUsfm_Verse_PretranslationsBeforeText()
{
    const string earlierText = "Pretranslations before the start";
    string[] earlierRefs = { "GEN 1:1", "GEN 1:2", "GEN 1:3", "GEN 1:4", "GEN 1:5" };

    var rows = new List<(IReadOnlyList<ScriptureRef>, string)>();
    foreach (string earlierRef in earlierRefs)
    {
        rows.Add((ScrRef(earlierRef), earlierText));
    }
    rows.Add((ScrRef("MAT 1:0/3:ip"), "The introductory paragraph."));

    string target = UpdateUsfm(rows);

    Assert.That(target, Contains.Substring("\\ip The introductory paragraph.\r\n"));
}

private static ScriptureRef[] ScrRef(params string[] refs)
{
return refs.Select(r => ScriptureRef.Parse(r)).ToArray();
Expand Down
2 changes: 1 addition & 1 deletion tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public void Tokenize()
string usfm = ReadUsfm();
var tokenizer = new UsfmTokenizer();
IReadOnlyList<UsfmToken> tokens = tokenizer.Tokenize(usfm);
Assert.That(tokens, Has.Count.EqualTo(203));
Assert.That(tokens, Has.Count.EqualTo(202));

Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book));
Assert.That(tokens[0].Marker, Is.EqualTo("id"));
Expand Down
1 change: 1 addition & 0 deletions tests/SIL.Machine.Tests/SIL.Machine.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="System.Linq.Async" Version="6.0.1" />
</ItemGroup>

<ItemGroup>
Expand Down

0 comments on commit dd7ac6d

Please sign in to comment.