From f59e3a9c4539e993714fc7dd94de0417c320d726 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Thu, 21 Nov 2024 12:59:15 -0500
Subject: [PATCH 1/2] Search prefixed Hebrew words, return non-prefixed
 versions (#1582)

---
 geniza/corpus/solr_queryset.py                | 43 ++++++++++++++++++-
 .../corpus/tests/test_corpus_solrqueryset.py  | 10 +++++
 geniza/corpus/tests/test_corpus_views.py      | 32 ++++++++++++++
 3 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
index 8d1392f13..545fd7830 100644
--- a/geniza/corpus/solr_queryset.py
+++ b/geniza/corpus/solr_queryset.py
@@ -135,6 +135,44 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
     # if search consists only of quoted phrase scoped to shelfmark, handle separately
     shelfmark_query = None
 
+    # hebrew prefixes that should be removed to produce an additional keyword to search
+    re_hebrew_prefix = re.compile(r"\b(אל|[ולבכמהשׁפ])[\u0590-\u05fe]+\b")
+
+    def _handle_hebrew_prefixes(self, search_term):
+        """Expand each Hebrew-prefixed word in a search term into an OR group.
+
+        Every match of ``re_hebrew_prefix`` is replaced, in place, by
+        ``(word OR word-without-prefix)``; all other text is kept as is.
+
+        :param search_term: search string (or fragment of one) to expand
+        :returns: expanded search string; unchanged if no word is prefixed
+        """
+        # if any word begins with one of the prefixes, update search to include the word
+        # without that prefix as well
+        prefixed_words = self.re_hebrew_prefix.finditer(search_term)
+        if prefixed_words:
+            prefixed_words = [w.group(0) for w in prefixed_words]
+            prefixed_or_nonprefixed_query = [
+                # handle two-charater prefix אל by removing 2 chars
+                f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})"
+                for word in prefixed_words
+            ]
+            # swap each prefixed word for a sentinel and split on it (re.split
+            # would also emit the capturing group). Empty segments are kept on
+            # purpose: segment i must be the text before prefixed word i, so a
+            # term that starts with a prefixed word needs its empty first segment
+            delim = "!SPLITME!"
+            segments = self.re_hebrew_prefix.sub(delim, search_term).split(delim)
+            # stitch the search query back together, in the original order
+            return "".join(
+                itertools.chain.from_iterable(
+                    itertools.zip_longest(
+                        segments, prefixed_or_nonprefixed_query, fillvalue=""
+                    )
+                )
+            )
+        return search_term
+
     def _search_term_cleanup(self, search_term):
         # adjust user search string before sending to solr
 
@@ -157,7 +195,8 @@ def _search_term_cleanup(self, search_term):
             # add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be
             # converted to JA, as this breaks if any brackets or other sigla are in doublequotes)
             remaining_phrases = [
-                arabic_or_ja(p) for p in self.re_exact_match.split(search_term)
+                self._handle_hebrew_prefixes(arabic_or_ja(p))
+                for p in self.re_exact_match.split(search_term)
             ]
             # stitch the search query back together, in order, so that boolean operators
             # and phrase order are preserved
@@ -171,7 +210,7 @@ def _search_term_cleanup(self, search_term):
                 )
             )
         else:
-            search_term = arabic_or_ja(search_term)
+            search_term = self._handle_hebrew_prefixes(arabic_or_ja(search_term))
 
         # convert any field aliases used in search terms to actual solr fields
         # (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena")
diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
index e6b540c2e..047e31608 100644
--- a/geniza/corpus/tests/test_corpus_solrqueryset.py
+++ b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -217,6 +217,16 @@ def test_search_term_cleanup__quoted_shelfmark_only(self):
         assert "NS" in dqs._search_term_cleanup("shelfmark:NS")
         assert not dqs.shelfmark_query
 
+    def test_handle_hebrew_prefixes(self):
+        # words with a hebrew prefix become OR queries on the word with or without it
+        dqs = DocumentSolrQuerySet()
+        expected_queries = {
+            "אלמרכב": "(אלמרכב OR מרכב)",
+            "test one משיח two כבוד": "test one (משיח OR שיח) two (כבוד OR בוד)",
+        }
+        for term, query in expected_queries.items():
+            assert dqs._search_term_cleanup(term) == query
+
     def test_keyword_search__quoted_shelfmark(self):
         dqs = DocumentSolrQuerySet()
         with patch.object(dqs, "search") as mocksearch:
diff --git a/geniza/corpus/tests/test_corpus_views.py b/geniza/corpus/tests/test_corpus_views.py
index cec5ae8ca..8fd3a9618 100644
--- a/geniza/corpus/tests/test_corpus_views.py
+++ b/geniza/corpus/tests/test_corpus_views.py
@@ -1307,6 +1307,38 @@ def test_exact_search_highlight(self, source, empty_solr):
             in dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][0]
         )
 
+    @pytest.mark.django_db
+    def test_hebrew_prefix_highlight(self, source, empty_solr):
+        """A search for a prefixed Hebrew word highlights its unprefixed form."""
+        doc = Document.objects.create()
+        edition = Footnote.objects.create(
+            content_object=doc,
+            source=source,
+            doc_relation=Footnote.DIGITAL_EDITION,
+        )
+        # the annotation body holds only the bare word מרכב,
+        # i.e. the searched word without its prefix אל
+        annotation_content = {
+            "body": [{"value": "מרכב"}],
+            "target": {"source": {"id": source.uri}},
+        }
+        Annotation.objects.create(
+            footnote=edition,
+            content=annotation_content,
+        )
+        SolrClient().update.index([doc.index_data()], commit=True)
+
+        # query with the prefixed form, which is longer than
+        # the word that was actually indexed
+        view = DocumentSearchView(kwargs={})
+        view.request = Mock()
+        view.request.GET = {"q": "אלמרכב"}
+        highlights = view.get_queryset().get_highlighting()
+
+        # should match and highlight the word without the prefix
+        transcription_hits = highlights[f"document.{doc.pk}"]["transcription"]
+        assert transcription_hits[0] == clean_html("<em>מרכב</em>")
+
 
 class TestDocumentScholarshipView:
     def test_page_title(self, document, client, source):

From 3c30b12ba119ca650a836eb6ce68bb69c9b965ad Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Thu, 21 Nov 2024 13:27:50 -0500
Subject: [PATCH 2/2] Fix conditional check if prefixes are present (#1582)

---
 geniza/corpus/solr_queryset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
index 545fd7830..1a828d668 100644
--- a/geniza/corpus/solr_queryset.py
+++ b/geniza/corpus/solr_queryset.py
@@ -142,8 +142,8 @@ def _handle_hebrew_prefixes(self, search_term):
         # if any word begins with one of the prefixes, update search to include the word
         # without that prefix as well
         prefixed_words = self.re_hebrew_prefix.finditer(search_term)
+        prefixed_words = [w.group(0) for w in prefixed_words]
         if prefixed_words:
-            prefixed_words = [w.group(0) for w in prefixed_words]
             prefixed_or_nonprefixed_query = [
                 # handle two-charater prefix אל by removing 2 chars
                 f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})"