Skip to content

Commit

Permalink
Add support for mixed source corpora (#177)
Browse files (browse the repository at this point in the history)
- align all Scripture corpora to Original versification
  • Loading branch information
ddaspit authored Mar 22, 2024
1 parent b4e91d0 commit 29c9799
Show file tree
Hide file tree
Showing 41 changed files with 950 additions and 529 deletions.
8 changes: 4 additions & 4 deletions src/SIL.Machine.AspNetCore/Models/Corpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ public record Corpus
public required string TargetLanguage { get; init; }
public required bool TrainOnAll { get; init; }
public required bool PretranslateAll { get; init; }
public IReadOnlyDictionary<string, IReadOnlySet<int>>? TrainOnChapters { get; init; }
public IReadOnlyDictionary<string, IReadOnlySet<int>>? PretranslateChapters { get; init; }
public required IReadOnlySet<string> TrainOnTextIds { get; init; }
public required IReadOnlySet<string> PretranslateTextIds { get; init; }
public IReadOnlyDictionary<string, HashSet<int>>? TrainOnChapters { get; init; }
public IReadOnlyDictionary<string, HashSet<int>>? PretranslateChapters { get; init; }
public required HashSet<string> TrainOnTextIds { get; init; }
public required HashSet<string> PretranslateTextIds { get; init; }
public required IReadOnlyList<CorpusFile> SourceFiles { get; init; }
public required IReadOnlyList<CorpusFile> TargetFiles { get; init; }
}
60 changes: 38 additions & 22 deletions src/SIL.Machine.AspNetCore/Services/CorpusService.cs
Original file line number Diff line number Diff line change
@@ -1,35 +1,51 @@
namespace SIL.Machine.AspNetCore.Services;

/// <summary>
/// Kind of corpus; used as the key of the dictionary returned by <c>CreateTextCorpus</c>.
/// </summary>
// NOTE(review): this text is a rendered diff without +/- markers. The IEnumerable-returning
// CreateTextCorpora/CreateTermCorpora methods do not reference this enum, so it is presumably
// deleted by this commit - confirm against the repository.
public enum CorpusType
{
// Key under which CreateTextCorpus stores the text corpus.
Text,
// Key under which CreateTextCorpus stores the Paratext terms corpus.
Term
}

/// <summary>
/// Builds <c>ITextCorpus</c> instances from lists of <c>CorpusFile</c> entries.
/// </summary>
// NOTE(review): this span is a rendered diff without +/- markers. Lines of the earlier
// CreateTextCorpus method (dictionary keyed by CorpusType) appear interleaved with lines of the
// CreateTextCorpora and CreateTermCorpora methods, so the text is not compilable as shown.
public class CorpusService : ICorpusService
{
public IDictionary<CorpusType, ITextCorpus> CreateTextCorpus(IReadOnlyList<CorpusFile> files)
/// <summary>
/// Creates the text corpora for <paramref name="files"/>. Each Paratext file yields its own
/// <c>ParatextBackupTextCorpus</c>; text files are grouped into dictionaries keyed by text id
/// and each group becomes one <c>DictionaryTextCorpus</c>.
/// </summary>
/// <returns>
/// The Paratext corpora in file order, followed by the corpora built from the text-file groups.
/// </returns>
public IEnumerable<ITextCorpus> CreateTextCorpora(IReadOnlyList<CorpusFile> files)
{
IDictionary<CorpusType, ITextCorpus> corpora = new Dictionary<CorpusType, ITextCorpus>();
if (files.Count == 1 && files[0].Format == FileFormat.Paratext)
List<ITextCorpus> corpora = [];

// Each dictionary collects the texts of one future DictionaryTextCorpus, keyed by text id.
List<Dictionary<string, IText>> textFileCorpora = [];
foreach (CorpusFile file in files)
{
corpora[CorpusType.Text] = new ParatextBackupTextCorpus(files[0].Location);
corpora[CorpusType.Term] = new ParatextBackupTermsCorpus(files[0].Location, new string[] { "PN" });
switch (file.Format)
{
case FileFormat.Text:
// if there are multiple texts with the same id, then add it to a new corpus or the first
// corpus that doesn't contain a text with that id
Dictionary<string, IText>? corpus = textFileCorpora.FirstOrDefault(c =>
!c.ContainsKey(file.TextId)
);
if (corpus is null)
{
// Every existing group already holds this text id, so start a new group.
corpus = [];
textFileCorpora.Add(corpus);
}
corpus[file.TextId] = new TextFileText(file.TextId, file.Location);
break;

case FileFormat.Paratext:
corpora.Add(new ParatextBackupTextCorpus(file.Location));
break;
}
}
else
// Text-file groups are appended after the loop, i.e. after all Paratext corpora.
foreach (Dictionary<string, IText> corpus in textFileCorpora)
corpora.Add(new DictionaryTextCorpus(corpus.Values));

return corpora;
}

/// <summary>
/// Lazily yields one <c>ParatextBackupTermsCorpus</c> per Paratext file in
/// <paramref name="files"/>; no other file format is handled by the switch.
/// </summary>
public IEnumerable<ITextCorpus> CreateTermCorpora(IReadOnlyList<CorpusFile> files)
{
foreach (CorpusFile file in files)
{
var texts = new List<IText>();
foreach (CorpusFile file in files)
switch (file.Format)
{
switch (file.Format)
{
case FileFormat.Text:
texts.Add(new TextFileText(file.TextId, file.Location));
break;
}
case FileFormat.Paratext:
// NOTE(review): "PN" is presumably a term category filter - confirm against ParatextBackupTermsCorpus.
yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]);
break;
}
corpora[CorpusType.Text] = new DictionaryTextCorpus(texts);
}
return corpora;
}
}
3 changes: 2 additions & 1 deletion src/SIL.Machine.AspNetCore/Services/ICorpusService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

/// <summary>
/// Service contract for building corpora from lists of <c>CorpusFile</c> entries.
/// </summary>
// NOTE(review): this text is a rendered diff without +/- markers. CreateTextCorpus appears to be
// the member removed by this commit and the two IEnumerable-returning members its replacements -
// confirm against the repository.
public interface ICorpusService
{
/// <summary>Creates corpora for <paramref name="files"/>, keyed by <c>CorpusType</c>.</summary>
IDictionary<CorpusType, ITextCorpus> CreateTextCorpus(IReadOnlyList<CorpusFile> files);
/// <summary>Creates the text corpora for <paramref name="files"/>.</summary>
IEnumerable<ITextCorpus> CreateTextCorpora(IReadOnlyList<CorpusFile> files);
/// <summary>Creates the term corpora for <paramref name="files"/>.</summary>
IEnumerable<ITextCorpus> CreateTermCorpora(IReadOnlyList<CorpusFile> files);
}
Loading

0 comments on commit 29c9799

Please sign in to comment.