From 8ea1117e3324de3eefd43ceb9fcf3374edcda277 Mon Sep 17 00:00:00 2001
From: "Eli C. Lowry" <83078660+Enkidu93@users.noreply.github.com>
Date: Wed, 7 Feb 2024 11:05:13 -0500
Subject: [PATCH] Scripture range parser (#168)

* Added chapter-level filtering; fixes https://github.com/sillsdev/serval/issues/150
* Move scripture range parsing to Serval
* Changes as per review comments
* Count() to Count
* Remove AspNetCore changes
* Remove import change
---
 .../Scripture/ScriptureRangeParser.cs         | 342 ++++++++++++++++++
 .../Scripture/ScriptureRangeParserTests.cs    | 209 +++++++++++
 2 files changed, 551 insertions(+)
 create mode 100644 src/SIL.Machine/Scripture/ScriptureRangeParser.cs
 create mode 100644 tests/SIL.Machine.Tests/Scripture/ScriptureRangeParserTests.cs

diff --git a/src/SIL.Machine/Scripture/ScriptureRangeParser.cs b/src/SIL.Machine/Scripture/ScriptureRangeParser.cs
new file mode 100644
index 000000000..e3f78ba3b
--- /dev/null
+++ b/src/SIL.Machine/Scripture/ScriptureRangeParser.cs
@@ -0,0 +1,342 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
+using SIL.Extensions;
+using SIL.Scripture;
+
+/// <summary>
+/// Parses a scripture range string, e.g. "MAT-ROM;-ACT4-28", into the chapters selected for each book.
+/// </summary>
+/// <remarks>
+/// A range string consists of sections separated by semicolons; commas may be used instead when every
+/// section is a bare book ID, "OT" or "NT". A section is a book ID ("MAT"), a range of books ("MAT-LUK"),
+/// "OT", "NT", or a book ID followed by chapters and chapter ranges ("EXO1,8,3-5"). A section prefixed with
+/// '-' is subtracted from everything selected by the sections before it. In the result, an empty chapter
+/// list means that the whole book is selected.
+/// </remarks>
+public class ScriptureRangeParser
+{
+    private const int BookIdLength = 3;
+    private const int FirstOldTestamentBookNum = 1;
+    private const int LastOldTestamentBookNum = 39;
+    private const int FirstNewTestamentBookNum = 40;
+    private const int LastNewTestamentBookNum = 66;
+
+    private readonly Dictionary<string, int> _bookLengths = new Dictionary<string, int>();
+    private static readonly Regex CommaSeparatedBooks = new Regex(
+        @"^([A-Z\d]{3}|OT|NT)(, ?([A-Z\d]{3}|OT|NT))*$",
+        RegexOptions.Compiled
+    );
+    private static readonly Regex BookRange = new Regex(@"^-?[A-Z\d]{3}-[A-Z\d]{3}$", RegexOptions.Compiled);
+    private static readonly Regex ChapterSelection = new Regex(
+        @"^-?[A-Z\d]{3} ?(\d+|\d+-\d+)(, ?(\d+|\d+-\d+))*$",
+        RegexOptions.Compiled
+    );
+
+    /// <summary>
+    /// Parses <paramref name="chapterSelections"/> with a parser created for <paramref name="versification"/>.
+    /// </summary>
+    /// <param name="chapterSelections">The scripture range string to parse.</param>
+    /// <param name="versification">Determines each book's last chapter; null means ScrVers.Original.</param>
+    /// <returns>The selected chapters keyed by book ID; an empty list means the whole book.</returns>
+    /// <exception cref="ArgumentException">The range string is invalid.</exception>
+    public static Dictionary<string, List<int>> GetChapters(string chapterSelections, ScrVers versification = null)
+    {
+        return new ScriptureRangeParser(versification).GetChapters(chapterSelections);
+    }
+
+    /// <summary>
+    /// Creates a parser that validates chapter numbers against the given versification.
+    /// </summary>
+    /// <param name="versification">Determines each book's last chapter; null means ScrVers.Original.</param>
+    public ScriptureRangeParser(ScrVers versification = null)
+    {
+        if (versification == null)
+            versification = ScrVers.Original;
+        foreach ((string bookId, int bookNum) in Canon.AllBookIds.Zip(Canon.AllBookNumbers))
+        {
+            _bookLengths[bookId] = versification.GetLastChapter(bookNum);
+        }
+    }
+
+    /// <summary>
+    /// Parses a single section (without any leading '-') into the chapters it names for each book.
+    /// </summary>
+    private Dictionary<string, List<int>> ParseSection(string section)
+    {
+        section = section.Trim();
+        if (section.Length == 0)
+        {
+            throw new ArgumentException("A selection cannot be empty.");
+        }
+
+        var chaptersPerBook = new Dictionary<string, List<int>>();
+
+        //*Whole book*
+        // Checked first so that a book ID that itself ends in a digit is not mistaken for a chapter selection.
+        if (_bookLengths.ContainsKey(section))
+        {
+            chaptersPerBook[section] = new List<int>();
+        }
+        //*Specific chapters from one book*
+        else if (char.IsDigit(section[section.Length - 1]))
+        {
+            // Too short to hold a book ID; without this guard Substring would throw an out-of-range exception.
+            if (section.Length < BookIdLength)
+            {
+                throw new ArgumentException($"{section} is an invalid chapter selection.");
+            }
+            string bookName = section.Substring(0, BookIdLength);
+            chaptersPerBook[bookName] = ParseChapters(bookName, section.Substring(BookIdLength), section);
+        }
+        //*Ranges of books to be added*
+        else if (section.Contains('-'))
+        {
+            string[] startAndEnd = section.Split('-');
+            if (
+                startAndEnd.Length != 2
+                || !_bookLengths.ContainsKey(startAndEnd[0])
+                || !_bookLengths.ContainsKey(startAndEnd[1])
+                || Canon.BookIdToNumber(startAndEnd[1]) <= Canon.BookIdToNumber(startAndEnd[0])
+            )
+            {
+                throw new ArgumentException($"{section} is an invalid book range.");
+            }
+            AddWholeBooks(
+                chaptersPerBook,
+                Canon.BookIdToNumber(startAndEnd[0]),
+                Canon.BookIdToNumber(startAndEnd[1])
+            );
+        }
+        //*OT*
+        else if (section == "OT")
+        {
+            AddWholeBooks(chaptersPerBook, FirstOldTestamentBookNum, LastOldTestamentBookNum);
+        }
+        //*NT*
+        else if (section == "NT")
+        {
+            AddWholeBooks(chaptersPerBook, FirstNewTestamentBookNum, LastNewTestamentBookNum);
+        }
+        else
+        {
+            throw new ArgumentException($"{section} is an invalid book ID.");
+        }
+
+        return chaptersPerBook;
+    }
+
+    /// <summary>
+    /// Parses the comma-separated chapters and chapter ranges that follow a book ID.
+    /// </summary>
+    /// <returns>The sorted chapter numbers, or an empty list if every chapter of the book is named.</returns>
+    private List<int> ParseChapters(string bookName, string chapterList, string section)
+    {
+        if (!_bookLengths.TryGetValue(bookName, out int lastChapter))
+        {
+            throw new ArgumentException($"{bookName} is an invalid book ID.");
+        }
+
+        var chapters = new HashSet<int>();
+        foreach (string chapterRangeString in chapterList.Split(',').Select(s => s.Trim()))
+        {
+            if (chapterRangeString.Contains('-'))
+            {
+                string[] startAndEnd = chapterRangeString.Split('-');
+                // Exactly two parts are required so that input such as "1-2-3" is rejected instead of
+                // silently being read as "1-2".
+                if (
+                    startAndEnd.Length != 2
+                    || !int.TryParse(startAndEnd[0], out int start)
+                    || !int.TryParse(startAndEnd[1], out int end)
+                    || start < 1
+                    || end > lastChapter
+                    || end <= start
+                )
+                {
+                    throw new ArgumentException($"{chapterRangeString} is an invalid chapter range.");
+                }
+                for (int chapterNum = start; chapterNum <= end; chapterNum++)
+                {
+                    chapters.Add(chapterNum);
+                }
+            }
+            else
+            {
+                // Chapters are numbered from 1, so 0 is as invalid as a number past the end of the book.
+                if (
+                    !int.TryParse(chapterRangeString, out int chapterNum)
+                    || chapterNum < 1
+                    || chapterNum > lastChapter
+                )
+                {
+                    throw new ArgumentException($"{section} is an invalid chapter number.");
+                }
+                chapters.Add(chapterNum);
+            }
+        }
+
+        // Naming every chapter is the same as selecting the whole book, which an empty list represents.
+        if (chapters.Count == lastChapter)
+        {
+            return new List<int>();
+        }
+        List<int> sortedChapters = chapters.ToList();
+        sortedChapters.Sort();
+        return sortedChapters;
+    }
+
+    /// <summary>
+    /// Selects, in full, every book from <paramref name="firstBookNum"/> to <paramref name="lastBookNum"/>.
+    /// </summary>
+    private static void AddWholeBooks(
+        Dictionary<string, List<int>> chaptersPerBook,
+        int firstBookNum,
+        int lastBookNum
+    )
+    {
+        for (int bookNum = firstBookNum; bookNum <= lastBookNum; bookNum++)
+        {
+            chaptersPerBook[Canon.BookNumberToId(bookNum)] = new List<int>();
+        }
+    }
+
+    /// <summary>
+    /// Lists every chapter number of the given book, from 1 up to its last chapter.
+    /// </summary>
+    private List<int> AllChapters(string bookName)
+    {
+        return Enumerable.Range(1, _bookLengths[bookName]).ToList();
+    }
+
+    /// <summary>
+    /// Parses a scripture range string into the chapters selected for each book.
+    /// </summary>
+    /// <param name="chapterSelections">The scripture range string to parse.</param>
+    /// <returns>The selected chapters keyed by book ID; an empty list means the whole book.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="chapterSelections"/> is null.</exception>
+    /// <exception cref="ArgumentException">The range string is invalid.</exception>
+    public Dictionary<string, List<int>> GetChapters(string chapterSelections)
+    {
+        if (chapterSelections == null)
+        {
+            throw new ArgumentNullException(nameof(chapterSelections));
+        }
+
+        var chaptersPerBook = new Dictionary<string, List<int>>();
+        chapterSelections = chapterSelections.Trim();
+
+        // Semicolons always take precedence. Without them the input must be either a comma-separated list of
+        // whole books or one single selection, because commas also separate chapters within a selection.
+        char delimiter = ';';
+        if (!chapterSelections.Contains(';'))
+        {
+            if (CommaSeparatedBooks.IsMatch(chapterSelections))
+            {
+                delimiter = ',';
+            }
+            else if (!BookRange.IsMatch(chapterSelections) && !ChapterSelection.IsMatch(chapterSelections))
+            {
+                throw new ArgumentException(
+                    "Invalid syntax. If you are providing multiple selections, e.g. a range of books followed by a selection of chapters from a book, separate each selection with a semicolon."
+                );
+            }
+        }
+
+        foreach (string section in chapterSelections.Split(delimiter).Select(s => s.Trim()))
+        {
+            //*Subtraction*
+            if (section.StartsWith("-", StringComparison.Ordinal))
+            {
+                RemoveSection(chaptersPerBook, ParseSection(section.Substring(1)));
+            }
+            //*Addition*
+            else
+            {
+                AddSection(chaptersPerBook, ParseSection(section));
+            }
+        }
+        return chaptersPerBook;
+    }
+
+    /// <summary>
+    /// Removes the chapters named by a parsed section from the current selection.
+    /// </summary>
+    private void RemoveSection(
+        Dictionary<string, List<int>> chaptersPerBook,
+        Dictionary<string, List<int>> sectionChapters
+    )
+    {
+        foreach (KeyValuePair<string, List<int>> bookSelection in sectionChapters)
+        {
+            string bookName = bookSelection.Key;
+            if (!chaptersPerBook.TryGetValue(bookName, out List<int> selectedChapters))
+            {
+                throw new ArgumentException(
+                    $"{bookName} cannot be removed as it is not in the existing book selection."
+                );
+            }
+
+            // An empty list stands for the whole book, so expand it before removing individual chapters.
+            // The expansion is kept in locals: assigning into sectionChapters while enumerating it can
+            // invalidate the enumerator on runtimes that treat overwriting a value as a modification.
+            List<int> chaptersToRemove = bookSelection.Value.Count == 0 ? AllChapters(bookName) : bookSelection.Value;
+            if (selectedChapters.Count == 0)
+            {
+                selectedChapters = AllChapters(bookName);
+            }
+
+            foreach (int chapterNumber in chaptersToRemove)
+            {
+                if (!selectedChapters.Remove(chapterNumber))
+                {
+                    throw new ArgumentException(
+                        $"{chapterNumber} cannot be removed as it is not in the existing chapter selection."
+                    );
+                }
+            }
+
+            if (selectedChapters.Count == 0)
+            {
+                chaptersPerBook.Remove(bookName);
+            }
+            else
+            {
+                chaptersPerBook[bookName] = selectedChapters;
+            }
+        }
+    }
+
+    /// <summary>
+    /// Merges the chapters named by a parsed section into the current selection.
+    /// </summary>
+    private void AddSection(
+        Dictionary<string, List<int>> chaptersPerBook,
+        Dictionary<string, List<int>> sectionChapters
+    )
+    {
+        foreach (KeyValuePair<string, List<int>> bookSelection in sectionChapters)
+        {
+            string bookName = bookSelection.Key;
+            List<int> newChapters = bookSelection.Value;
+            if (!chaptersPerBook.TryGetValue(bookName, out List<int> selectedChapters))
+            {
+                chaptersPerBook[bookName] = newChapters;
+                continue;
+            }
+
+            // An empty list on either side already stands for the whole book.
+            if (selectedChapters.Count == 0 || newChapters.Count == 0)
+            {
+                chaptersPerBook[bookName] = new List<int>();
+                continue;
+            }
+
+            List<int> mergedChapters = selectedChapters.Union(newChapters).ToList();
+            mergedChapters.Sort();
+            chaptersPerBook[bookName] =
+                mergedChapters.Count == _bookLengths[bookName] ? new List<int>() : mergedChapters;
+        }
+    }
+}
diff --git a/tests/SIL.Machine.Tests/Scripture/ScriptureRangeParserTests.cs b/tests/SIL.Machine.Tests/Scripture/ScriptureRangeParserTests.cs
new file mode 100644
index 000000000..4330f89ef
--- /dev/null
+++ b/tests/SIL.Machine.Tests/Scripture/ScriptureRangeParserTests.cs
@@ -0,0 +1,209 @@
+using NUnit.Framework;
+using SIL.Scripture;
+
+namespace SIL.Machine;
+
+[TestFixture]
+public class ScriptureRangeParserTests
+{
+    [Test]
+    [TestCaseSource(nameof(GetCases))]
+    public void TestParse(string rangeString, Dictionary<string, List<int>> expectedOutput, bool throwsException)
+    {
+        var parser = new ScriptureRangeParser();
+        if (!throwsException)
+        {
+            Assert.That(parser.GetChapters(rangeString), Is.EquivalentTo(expectedOutput));
+        }
+        else
+        {
+            Assert.Throws<ArgumentException>(() =>
+            {
+                parser.GetChapters(rangeString);
+            });
+        }
+    }
+
+    public static IEnumerable<TestCaseData> GetCases()
+    {
+        yield return new TestCaseData("MAL", new Dictionary<string, List<int>> { { "MAL", new List<int>() } }, false);
+        yield return new TestCaseData(
+            "GEN,EXO",
+            new Dictionary<string, List<int>> { { "GEN", new List<int>() }, { "EXO", new List<int>() } },
+            false
+        );
+        yield return new TestCaseData(
+            "1JN,2JN",
+            new Dictionary<string, List<int>> { { "1JN", new List<int>() }, { "2JN", new List<int>() } },
+            false
+        );
+        yield return new TestCaseData(
+            "OT",
+            Enumerable.Range(1, 39).Select(i => (Canon.BookNumberToId(i), new List<int>())).ToDictionary(),
+            false
+        );
+        yield return new TestCaseData(
+            "NT",
+            Enumerable.Range(40, 27).Select(i => (Canon.BookNumberToId(i), new List<int>())).ToDictionary(),
+            false
+        );
+        yield return new TestCaseData(
+            "NT,OT",
+            Enumerable.Range(1, 66).Select(i => (Canon.BookNumberToId(i), new List<int>())).ToDictionary(),
+            false
+        );
+        yield return new TestCaseData(
+            "MAT;MRK",
+            new Dictionary<string, List<int>> { { "MAT", new List<int>() }, { "MRK", new List<int>() } },
+            false
+        );
+        yield return new TestCaseData(
+            "MAT; MRK",
+            new Dictionary<string, List<int>> { { "MAT", new List<int>() }, { "MRK", new List<int>() } },
+            false
+        );
+        yield return new TestCaseData(
+            "MAT1,2,3",
+            new Dictionary<string, List<int>>
+            {
+                {
+                    "MAT",
+                    new List<int>() { 1, 2, 3 }
+                }
+            },
+            false
+        );
+        yield return new TestCaseData(
+            "MAT1, 2, 3",
+            new Dictionary<string, List<int>>
+            {
+                {
+                    "MAT",
+                    new List<int>() { 1, 2, 3 }
+                }
+            },
+            false
+        );
+        yield return new TestCaseData(
+            "MAT-LUK",
+            new Dictionary<string, List<int>>
+            {
+                { "MAT", new List<int>() },
+                { "MRK", new List<int>() },
+                { "LUK", new List<int>() }
+            },
+            false
+        );
+        yield return new TestCaseData(
+            "MAT1,2,3;MAT-LUK",
+            new Dictionary<string, List<int>>
+            {
+                { "MAT", new List<int>() },
+                { "MRK", new List<int>() },
+                { "LUK", new List<int>() }
+            },
+            false
+        );
+        yield return new TestCaseData(
+            "2JN-3JN;EXO1,8,3-5;GEN",
+            new Dictionary<string, List<int>>
+            {
+                { "GEN", new List<int>() },
+                {
+                    "EXO",
+                    new List<int>() { 1, 3, 4, 5, 8 }
+                },
+                { "2JN", new List<int>() },
+                { "3JN", new List<int>() }
+            },
+            false
+        );
+        yield return new TestCaseData(
+            "1JN 1;1JN 2;1JN 3-5",
+            new Dictionary<string, List<int>> { { "1JN", new List<int>() } },
+            false
+        );
+        yield return new TestCaseData(
+            "MAT-ROM;-ACT4-28",
+            new Dictionary<string, List<int>>
+            {
+                { "MAT", new List<int>() },
+                { "MRK", new List<int>() },
+                { "LUK", new List<int>() },
+                { "JHN", new List<int>() },
+                {
+                    "ACT",
+                    new List<int>() { 1, 2, 3 }
+                },
+                { "ROM", new List<int>() }
+            },
+            false
+        );
+        yield return new TestCaseData("2JN;-2JN 1", new Dictionary<string, List<int>> { }, false);
+        yield return new TestCaseData(
+            "NT;OT;-MRK;-EXO",
+            Enumerable
+                .Range(1, 66)
+                .Where(i => i != 2 && i != 41)
+                .Select(i => (Canon.BookNumberToId(i), new List<int>()))
+                .ToDictionary(),
+            false
+        );
+        yield return new TestCaseData(
+            "NT;-MAT3-5,17;-REV21,22",
+            Enumerable
+                .Range(40, 27)
+                .Select(i =>
+                {
+                    if (i == 40)
+                    {
+                        return (
+                            Canon.BookNumberToId(i),
+                            Enumerable.Range(1, 28).Where(c => !(c == 3 || c == 4 || c == 5 || c == 17)).ToList()
+                        );
+                    }
+                    if (i == 66)
+                    {
+                        return (Canon.BookNumberToId(i), Enumerable.Range(1, 20).ToList());
+                    }
+                    return (Canon.BookNumberToId(i), new List<int>());
+                })
+                .ToDictionary(),
+            false
+        );
+        yield return new TestCaseData(
+            "MAT-JHN;-MAT-LUK",
+            new Dictionary<string, List<int>> { { "JHN", new List<int>() } },
+            false
+        );
+
+        //*Throw exceptions
+        yield return new TestCaseData("MAT3-1", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MRK-MAT", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MRK;-MRK10-3", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT0-10", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT-FLUM", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("-MAT-FLUM", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("ABC", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT-ABC", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("NT;-ABC-LUK", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT 500", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT 1-500", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT;-MAT 300-500", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("-MRK", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("-MRK 1", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MRK 2-5;-MRK 1-4", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MRK 2-5;-MRK 6", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("OT;-MRK-LUK", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("NT;OT;-ABC", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT;-ABC 1", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("NT,OT,-MRK,-EXO", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("OT,MAT1", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("OT,MAT-LUK", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT 0", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT 1-2-3;GEN", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("GEN;", new Dictionary<string, List<int>>(), true);
+        yield return new TestCaseData("MAT;1", new Dictionary<string, List<int>>(), true);
+    }
+}