From f59e3a9c4539e993714fc7dd94de0417c320d726 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Thu, 21 Nov 2024 12:59:15 -0500 Subject: [PATCH 1/2] Search prefixed Hebrew words, return non-prefixed versions (#1582) --- geniza/corpus/solr_queryset.py | 43 ++++++++++++++++++- .../corpus/tests/test_corpus_solrqueryset.py | 10 +++++ geniza/corpus/tests/test_corpus_views.py | 32 ++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py index 8d1392f13..545fd7830 100644 --- a/geniza/corpus/solr_queryset.py +++ b/geniza/corpus/solr_queryset.py @@ -135,6 +135,44 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet): # if search consists only of quoted phrase scoped to shelfmark, handle separately shelfmark_query = None + # hebrew prefixes that should be removed to produce an additional keyword to search + re_hebrew_prefix = re.compile(r"\b(אל|[ולבכמהשׁפ])[\u0590-\u05fe]+\b") + + def _handle_hebrew_prefixes(self, search_term): + # if any word begins with one of the prefixes, update search to include the word + # without that prefix as well + prefixed_words = self.re_hebrew_prefix.finditer(search_term) + if prefixed_words: + prefixed_words = [w.group(0) for w in prefixed_words] + prefixed_or_nonprefixed_query = [ + # handle two-charater prefix אל by removing 2 chars + f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})" + for word in prefixed_words + ] + # use a custom delimiter to split on, since we need a capturing + # group in the original expression, but it changes the split function's + # behavior in an undesirable way + delim = "!SPLITME!" + nonprefixed_words = [ + n + for n in re.sub(self.re_hebrew_prefix, delim, search_term).split(delim) + if n + ] + + # stitch the search query back together + return "".join( + itertools.chain.from_iterable( + ( + itertools.zip_longest( + nonprefixed_words, + prefixed_or_nonprefixed_query, + fillvalue="", + ) + ) + ) + ) + return search_term + def _search_term_cleanup(self, search_term): # adjust user search string before sending to solr @@ -157,7 +195,8 @@ def _search_term_cleanup(self, search_term): # add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be # converted to JA, as this breaks if any brackets or other sigla are in doublequotes) remaining_phrases = [ - arabic_or_ja(p) for p in self.re_exact_match.split(search_term) + self._handle_hebrew_prefixes(arabic_or_ja(p)) + for p in self.re_exact_match.split(search_term) ] # stitch the search query back together, in order, so that boolean operators # and phrase order are preserved @@ -171,7 +210,7 @@ def _search_term_cleanup(self, search_term): ) ) else: - search_term = arabic_or_ja(search_term) + search_term = self._handle_hebrew_prefixes(arabic_or_ja(search_term)) # convert any field aliases used in search terms to actual solr fields # (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena") diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py index e6b540c2e..047e31608 100644 --- a/geniza/corpus/tests/test_corpus_solrqueryset.py +++ b/geniza/corpus/tests/test_corpus_solrqueryset.py @@ -217,6 +217,16 @@ def test_search_term_cleanup__quoted_shelfmark_only(self): assert "NS" in dqs._search_term_cleanup("shelfmark:NS") assert not dqs.shelfmark_query + def test_handle_hebrew_prefixes(self): + dqs = DocumentSolrQuerySet() + # should replace words with hebrew prefixes with OR queries + # on the same word with or without prefix + assert dqs._search_term_cleanup("אלמרכב") == "(אלמרכב OR מרכב)" + assert ( + dqs._search_term_cleanup("test one משיח two כבוד") + == "test one (משיח OR שיח) two (כבוד OR בוד)" + ) + def test_keyword_search__quoted_shelfmark(self): dqs = DocumentSolrQuerySet() with patch.object(dqs, "search") as mocksearch: diff --git a/geniza/corpus/tests/test_corpus_views.py b/geniza/corpus/tests/test_corpus_views.py index cec5ae8ca..8fd3a9618 100644 --- a/geniza/corpus/tests/test_corpus_views.py +++ b/geniza/corpus/tests/test_corpus_views.py @@ -1307,6 +1307,38 @@ def test_exact_search_highlight(self, source, empty_solr): in dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][0] ) + @pytest.mark.django_db + def test_hebrew_prefix_highlight(self, source, empty_solr): + # test matching for words without searched hebrew prefixes + document = Document.objects.create() + footnote = Footnote.objects.create( + content_object=document, + source=source, + doc_relation=Footnote.DIGITAL_EDITION, + ) + Annotation.objects.create( + footnote=footnote, + content={ + # body contains word מרכב without prefix אל + "body": [{"value": "מרכב"}], + "target": { + "source": { + "id": source.uri, + } + }, + }, + ) + SolrClient().update.index([document.index_data()], commit=True) + docsearch_view = DocumentSearchView(kwargs={}) + docsearch_view.request = Mock() + + # should match word without prefix, smaller than the entered query + docsearch_view.request.GET = {"q": "אלמרכב"} + dqs = docsearch_view.get_queryset() + assert dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][ + 0 + ] == clean_html("מרכב") + class TestDocumentScholarshipView: def test_page_title(self, document, client, source): From 3c30b12ba119ca650a836eb6ce68bb69c9b965ad Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Thu, 21 Nov 2024 13:27:50 -0500 Subject: [PATCH 2/2] Fix conditional check if prefixes are present (#1582) --- geniza/corpus/solr_queryset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py index 545fd7830..1a828d668 100644 --- a/geniza/corpus/solr_queryset.py +++ b/geniza/corpus/solr_queryset.py @@ -142,8 +142,8 @@ def _handle_hebrew_prefixes(self, search_term): # if any word begins with one of the prefixes, update search to include the word # without that prefix as well prefixed_words = self.re_hebrew_prefix.finditer(search_term) + prefixed_words = [w.group(0) for w in prefixed_words] if prefixed_words: - prefixed_words = [w.group(0) for w in prefixed_words] prefixed_or_nonprefixed_query = [ # handle two-charater prefix אל by removing 2 chars f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})"