Skip to content

Commit

Permalink
Merge pull request #1681 from Princeton-CDH/feature/1582-hebrew-prefixes
Browse files Browse the repository at this point in the history
Search prefixed Hebrew words, return non-prefixed versions (#1582)
  • Loading branch information
blms authored Nov 21, 2024
2 parents 9617499 + 3c30b12 commit f5c6cb4
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 2 deletions.
43 changes: 41 additions & 2 deletions geniza/corpus/solr_queryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,44 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
# if search consists only of quoted phrase scoped to shelfmark, handle separately
shelfmark_query = None

# hebrew prefixes that should be removed to produce an additional keyword to search.
# The pattern matches one whole word (bounded by \b on both sides) made up of a
# prefix, captured in group 1, followed by one or more characters in the range
# U+0590-U+05FE. The prefix is either the two-character alternative listed first
# or a single character from the bracketed set.
re_hebrew_prefix = re.compile(r"\b(אל|[ולבכמהשׁפ])[\u0590-\u05fe]+\b")

def _handle_hebrew_prefixes(self, search_term):
    """Expand Hebrew-prefixed words in a search string into OR clauses.

    Every word matched by ``self.re_hebrew_prefix`` is replaced, in place,
    by ``(<word> OR <word without its prefix>)`` so that the search also
    covers the non-prefixed form. All other text, including whitespace
    and operators between words, is left exactly where it was.

    :param search_term: search string to expand
    :returns: the expanded search string; the input is returned unchanged
        when no word matches the prefix pattern
    """

    def _with_and_without_prefix(match):
        # build the OR clause for a single prefixed word
        word = match.group(0)
        # group 1 is the matched prefix (two characters or one), so its
        # length is exactly the number of characters to strip
        unprefixed = word[len(match.group(1)) :]
        return f"({word} OR {unprefixed})"

    # Substitute each match where it occurs rather than splitting the string
    # and re-interleaving the pieces: discarding empty pieces while stitching
    # misplaces the surrounding text whenever the string starts with a
    # prefixed word or contains two prefixed words in a row.
    return self.re_hebrew_prefix.sub(_with_and_without_prefix, search_term)

def _search_term_cleanup(self, search_term):
# adjust user search string before sending to solr

Expand All @@ -157,7 +195,8 @@ def _search_term_cleanup(self, search_term):
# add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be
# converted to JA, as this breaks if any brackets or other sigla are in doublequotes)
remaining_phrases = [
arabic_or_ja(p) for p in self.re_exact_match.split(search_term)
self._handle_hebrew_prefixes(arabic_or_ja(p))
for p in self.re_exact_match.split(search_term)
]
# stitch the search query back together, in order, so that boolean operators
# and phrase order are preserved
Expand All @@ -171,7 +210,7 @@ def _search_term_cleanup(self, search_term):
)
)
else:
search_term = arabic_or_ja(search_term)
search_term = self._handle_hebrew_prefixes(arabic_or_ja(search_term))

# convert any field aliases used in search terms to actual solr fields
# (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena")
Expand Down
10 changes: 10 additions & 0 deletions geniza/corpus/tests/test_corpus_solrqueryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,16 @@ def test_search_term_cleanup__quoted_shelfmark_only(self):
assert "NS" in dqs._search_term_cleanup("shelfmark:NS")
assert not dqs.shelfmark_query

def test_handle_hebrew_prefixes(self):
    # cleanup should turn every hebrew-prefixed word into an OR query over
    # the word as entered and the same word with its prefix removed
    expected_by_query = {
        "אלמרכב": "(אלמרכב OR מרכב)",
        "test one משיח two כבוד": "test one (משיח OR שיח) two (כבוד OR בוד)",
    }
    queryset = DocumentSolrQuerySet()
    for query, expected in expected_by_query.items():
        assert queryset._search_term_cleanup(query) == expected

def test_keyword_search__quoted_shelfmark(self):
dqs = DocumentSolrQuerySet()
with patch.object(dqs, "search") as mocksearch:
Expand Down
32 changes: 32 additions & 0 deletions geniza/corpus/tests/test_corpus_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,38 @@ def test_exact_search_highlight(self, source, empty_solr):
in dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][0]
)

@pytest.mark.django_db
def test_hebrew_prefix_highlight(self, source, empty_solr):
    # searching a prefixed hebrew word should highlight the non-prefixed word
    document = Document.objects.create()
    edition = Footnote.objects.create(
        content_object=document,
        source=source,
        doc_relation=Footnote.DIGITAL_EDITION,
    )
    # the transcription body holds only מרכב, i.e. the word without prefix אל
    annotation_content = {
        "body": [{"value": "מרכב"}],
        "target": {"source": {"id": source.uri}},
    }
    Annotation.objects.create(footnote=edition, content=annotation_content)
    SolrClient().update.index([document.index_data()], commit=True)

    search_view = DocumentSearchView(kwargs={})
    search_view.request = Mock()
    # the query is the prefixed word, which is longer than the indexed word
    search_view.request.GET = {"q": "אלמרכב"}

    highlighting = search_view.get_queryset().get_highlighting()
    transcription_highlights = highlighting[f"document.{document.pk}"][
        "transcription"
    ]
    # should match word without prefix, smaller than the entered query
    assert transcription_highlights[0] == clean_html("<em>מרכב</em>")


class TestDocumentScholarshipView:
def test_page_title(self, document, client, source):
Expand Down

0 comments on commit f5c6cb4

Please sign in to comment.