Merge pull request #1681 from Princeton-CDH/feature/1582-hebrew-prefixes

Search prefixed Hebrew words, return non-prefixed versions (#1582)
Princeton-CDH · Nov 21, 2024 · f5c6cb4 · f5c6cb4
2 parents 9617499 + 3c30b12
commit f5c6cb4
Show file tree

Hide file tree

Showing 3 changed files with 83 additions and 2 deletions.
diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
@@ -135,6 +135,44 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
     # if search consists only of quoted phrase scoped to shelfmark, handle separately
     shelfmark_query = None
 
+    # hebrew prefixes that should be removed to produce an additional keyword to search
+    re_hebrew_prefix = re.compile(r"\b(אל|[ולבכמהשׁפ])[\u0590-\u05fe]+\b")
+
def _handle_hebrew_prefixes(self, search_term):
    """Expand prefixed Hebrew words so the unprefixed form is searched too.

    Every word in ``search_term`` matched by ``self.re_hebrew_prefix`` is
    replaced, in place, with ``(word OR unprefixed)``, where ``unprefixed``
    is the word minus the prefix captured by the pattern's first group
    (the two-character prefix אל, or a single prefix letter). All other
    text, including whitespace and boolean operators, stays where it was.

    :param search_term: search string, or a portion of one
    :returns: the search string with each prefixed word expanded; the
        string is returned unchanged if no word matches
    """

    def with_unprefixed(match):
        word = match.group(0)
        # slice by the length of the captured prefix, so that אל drops
        # two characters and a single-letter prefix drops one
        return f"({word} OR {word[len(match.group(1)):]})"

    # Substitute in place instead of splitting the string and zipping the
    # pieces back together: discarding empty split segments shifted the
    # alignment, so a prefixed word at the start of the string ended up
    # after the text that followed it, and adjacent prefixed words lost
    # the separator between them.
    return self.re_hebrew_prefix.sub(with_unprefixed, search_term)
+
     def _search_term_cleanup(self, search_term):
         # adjust user search string before sending to solr
 
@@ -157,7 +195,8 @@ def _search_term_cleanup(self, search_term):
             # add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be
             # converted to JA, as this breaks if any brackets or other sigla are in doublequotes)
             remaining_phrases = [
-                arabic_or_ja(p) for p in self.re_exact_match.split(search_term)
+                self._handle_hebrew_prefixes(arabic_or_ja(p))
+                for p in self.re_exact_match.split(search_term)
             ]
             # stitch the search query back together, in order, so that boolean operators
             # and phrase order are preserved
@@ -171,7 +210,7 @@ def _search_term_cleanup(self, search_term):
                 )
             )
         else:
-            search_term = arabic_or_ja(search_term)
+            search_term = self._handle_hebrew_prefixes(arabic_or_ja(search_term))
 
         # convert any field aliases used in search terms to actual solr fields
         # (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena")

diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -217,6 +217,16 @@ def test_search_term_cleanup__quoted_shelfmark_only(self):
         assert "NS" in dqs._search_term_cleanup("shelfmark:NS")
         assert not dqs.shelfmark_query
 
def test_handle_hebrew_prefixes(self):
    """Prefixed Hebrew words are rewritten as OR queries on both forms."""
    queryset = DocumentSolrQuerySet()
    # raw search string -> expected cleaned-up string, where each word with
    # a hebrew prefix becomes "(prefixed OR unprefixed)"
    expectations = {
        "אלמרכב": "(אלמרכב OR מרכב)",
        "test one משיח two כבוד": "test one (משיח OR שיח) two (כבוד OR בוד)",
    }
    for raw_query, cleaned_query in expectations.items():
        assert queryset._search_term_cleanup(raw_query) == cleaned_query
+
     def test_keyword_search__quoted_shelfmark(self):
         dqs = DocumentSolrQuerySet()
         with patch.object(dqs, "search") as mocksearch:

diff --git a/geniza/corpus/tests/test_corpus_views.py b/geniza/corpus/tests/test_corpus_views.py
@@ -1307,6 +1307,38 @@ def test_exact_search_highlight(self, source, empty_solr):
             in dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][0]
         )
 
@pytest.mark.django_db
def test_hebrew_prefix_highlight(self, source, empty_solr):
    """A prefixed Hebrew query highlights the unprefixed word it matched."""
    # index a document whose digital edition contains only the word מרכב,
    # i.e. without the prefix אל
    doc = Document.objects.create()
    edition = Footnote.objects.create(
        content_object=doc,
        source=source,
        doc_relation=Footnote.DIGITAL_EDITION,
    )
    annotation_content = {
        "body": [{"value": "מרכב"}],
        "target": {"source": {"id": source.uri}},
    }
    Annotation.objects.create(footnote=edition, content=annotation_content)
    SolrClient().update.index([doc.index_data()], commit=True)

    # run the search view with the prefixed form of the word as the query
    view = DocumentSearchView(kwargs={})
    view.request = Mock()
    view.request.GET = {"q": "אלמרכב"}
    results = view.get_queryset()

    # the highlighted match is the unprefixed word, shorter than the query
    highlights = results.get_highlighting()[f"document.{doc.pk}"]
    assert highlights["transcription"][0] == clean_html("<em>מרכב</em>")
+
 
 class TestDocumentScholarshipView:
     def test_page_title(self, document, client, source):