Allow searching JA to return Arabic results (#1679)

Princeton-CDH · Nov 25, 2024 · c790685 · c790685
1 parent 9617499
commit c790685
Show file tree

Hide file tree

Showing 3 changed files with 157 additions and 36 deletions.
diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py
@@ -50,6 +50,44 @@
     "נ": "ן",
 }
 
+ja_arabic_chars = {
+    "א": "ا",
+    "ב": "ب",
+    "ג": ["غ", "ج"],
+    "ג̇": ["غ", "ج"],
+    "ד": ["د", "ذ"],
+    "ד̇": ["د", "ذ"],
+    "ה": ["ة", "ه"],
+    "ו": "و",
+    "ז": "ز",
+    "ח": "ح",
+    "ט": ["ط", "ظ"],
+    "ט̇": ["ط", "ظ"],
+    "י": ["ى", "ي"],
+    "ך": ["ك", "خ"],
+    "ך̇": ["ك", "خ"],
+    "כ": ["ك", "خ"],
+    "כ̇": ["ك", "خ"],
+    "ל": "ل",
+    "ם": "م",
+    "מ": "م",
+    "ן": "ن",
+    "נ": "ن",
+    "ס": "س",
+    "ע": "ع",
+    "ף": "ف",
+    "פ": "ف",
+    "ץ": ["ص", "ض"],
+    "ץ̇": ["ص", "ض"],
+    "צ": ["ص", "ض"],
+    "צ̇": ["ص", "ض"],
+    "ק": "ق",
+    "ר": "ر",
+    "ש": "ش",
+    "ת": ["ت", "ث"],
+    "ת̇": ["ت", "ث"],
+}
+
 # iso codes are AR and JRB if we want to use those
 
 # generate translation tables
@@ -69,45 +107,85 @@ def contains_arabic(text):
 
 
 def arabic_to_ja(text):
-    # handle multiple words
-    # if there is no arabic text, return as is
-    if not contains_arabic(text):
-        return text
-
+    # handle multiple words, translate from arabic to ja
     text = text.translate(arabic_to_ja_table).strip()
     # convert last letter to final form if necessary
     # needs to use regex to handle accented characters, which complicate last letter indexing
     return re.sub(re_he_final_letters, lambda m: he_final_letters[m.group(0)], text)
 
 
-# regex to find arabic word or exact phrase with only arabic + whitepace
-re_AR_WORD_OR_PHRASE = re.compile(
-    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
-)
+# regex for range of hebrew letters
+re_HE_letters = re.compile(r"[\u0590-\u05fe]+")
 
 
-def arabic_or_ja(text, boost=True):
-    # find arabic tokens
-    arabic_wordphrases = re_AR_WORD_OR_PHRASE.findall(text)
+def contains_hebrew(text):
+    # check if the text contains any hebrew letters
+    return re_HE_letters.search(text)
+
+
+def ja_to_arabic(text):
+    # handle multiple words, translate from ja to arabic
+
+    # we can't use translate() because there are sometimes multiple options for
+    # the arabic translation, due to hebrew having fewer letters in its alphabet
+    for k, v in ja_arabic_chars.items():
+        if type(v) == list and k in text:
+            # list means there is more than one option, so join translations with OR
+            texts = []
+            for option in v:
+                texts.append(re.sub(k, option, text))
+            text = " OR ".join(texts)
+        elif type(v) == str:
+            # only one possible translation
+            text = re.sub(k, v, text)
+
+    return text.strip()
+
+
+def make_translingual(text, boost, pattern, trans_func):
+    # find matching tokens by regex
+    matching_wordphrases = pattern.findall(text)
 
     # get everything surrounding the matches
-    nonarabic_wordphrases = re_AR_WORD_OR_PHRASE.split(text)
+    nonmatching_wordphrases = pattern.split(text)
 
-    # rewrite arabic phrasesmatches
-    arabic_or_ja_wordphrases = [
-        f"({arabic_wordphrase}{'^2.0' if boost else ''}|{arabic_to_ja(arabic_wordphrase)})"
-        for arabic_wordphrase in arabic_wordphrases
+    # rewrite phrase matches using the translingual function, boost, and OR query
+    translingual_wordphrases = [
+        f"({wordphrase}{'^2.0' if boost else ''} OR {trans_func(wordphrase)})"
+        for wordphrase in matching_wordphrases
     ]
 
     # stitch the search query back together:
-    # pair tokens surrounding arabic terms with the arabic terms they were split on
-    # fill any missing values with empty strings and merge it all into a single string
+    # pair tokens surrounding matching terms with the terms they were split on,
+    # fill any missing values with empty strings, and merge it all into a single string
     return "".join(
         itertools.chain.from_iterable(
             (
                 itertools.zip_longest(
-                    nonarabic_wordphrases, arabic_or_ja_wordphrases, fillvalue=""
+                    nonmatching_wordphrases, translingual_wordphrases, fillvalue=""
                 )
             )
         )
     )
+
+
+# regex to find hebrew word, or exact phrase with only hebrew + whitespace
+re_HE_WORD_OR_PHRASE = re.compile(
+    r'"[\u0590-\u05fe]+[\s\u0590-\u05fe]*"|[\u0590-\u05fe]+'
+)
+
+# regex to find arabic word or exact phrase with only arabic + whitespace
+re_AR_WORD_OR_PHRASE = re.compile(
+    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
+)
+
+
+def arabic_or_ja(text, boost=True):
+    if not contains_hebrew(text) and not contains_arabic(text):
+        return text
+    texts = []
+    if contains_hebrew(text):
+        texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic))
+    if contains_arabic(text):
+        texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja))
+    return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0]
diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -164,7 +164,7 @@ def test_search_term_cleanup__nonbool(self):
     def test_search_term_cleanup__arabic_to_ja(self):
         dqs = DocumentSolrQuerySet()
         # confirm arabic to judaeo-arabic runs here (with boost)
-        assert dqs._search_term_cleanup("دينار") == "(دينار^2.0|דינאר)"
+        assert dqs._search_term_cleanup("دينار") == "(دينار^2.0 OR דינאר)"
         # confirm arabic to judaeo-arabic does not run here
         assert (
             dqs._search_term_cleanup('"دي[نا]ر"')

diff --git a/geniza/corpus/tests/test_ja.py b/geniza/corpus/tests/test_ja.py
@@ -1,6 +1,10 @@
-from operator import contains
-
-from geniza.corpus.ja import arabic_or_ja, arabic_to_ja, contains_arabic
+from geniza.corpus.ja import (
+    arabic_or_ja,
+    arabic_to_ja,
+    contains_arabic,
+    contains_hebrew,
+    ja_to_arabic,
+)
 
 
 def test_contains_arabic():
@@ -19,48 +23,87 @@ def test_arabic_to_ja():
     assert arabic_to_ja("english text") == "english text"
 
 
-def test_arabic_or_ja__no_arabic():
+def test_contains_hebrew():
+    assert not contains_hebrew("my keyword search")
+    assert not contains_hebrew("دينار")
+    assert contains_hebrew("דינאר mixed with english")
+    assert contains_hebrew("mixed מצחף and english")
+
+
+def test_ja_to_arabic():
+    assert ja_to_arabic("דינאר") == "دىنار OR ذىنار OR دينار OR ذينار"
+    assert ja_to_arabic("מצחף") == "مصحف OR مضحف"
+    assert ja_to_arabic("סנה") == "سنة OR سنه"
+    assert ja_to_arabic("טבאךֹ") == "طباكֹ OR ظباكֹ OR طباخֹ OR ظباخֹ"
+    assert ja_to_arabic("מ") == "م"
+    assert ja_to_arabic("") == ""
+    assert ja_to_arabic("english text") == "english text"
+    assert ja_to_arabic("دينار") == "دينار"
+
+
+def test_arabic_or_ja__no_arabic_or_ja():
     txt = "my keyword search"
     # should be unchanged
     assert arabic_or_ja(txt) == txt
 
 
 def test_arabic_or_ja__arabic():
     # single word — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("دينار", boost=False) == "(دينار|דינאר)"
+    assert arabic_or_ja("دينار", boost=False) == "(دينار OR דינאר)"
     # multiple words — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار|דינאר) (مصحف|מצחף)"
+    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار OR דינאר) (مصحف OR מצחף)"
     # mixed english and arabic
-    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف|מצחף)"
+    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف OR מצחף)"
+    # with boosting
+    assert arabic_or_ja("دينار") == "(دينار^2.0 OR דינאר)"
+
+
+def test_arabic_or_ja__ja():
+    # single word — should return match for arabic or judaeo-arabic
+    assert (
+        arabic_or_ja("דינאר", boost=False)
+        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار)"
+    )
+    # multiple words — should return match for arabic or judaeo-arabic
+    assert (
+        arabic_or_ja("דינאר מצחף", boost=False)
+        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار) (מצחף OR مصحف OR مضحف)"
+    )
+    # mixed english and judaeo-arabic
+    assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף OR مصحف OR مضحف)"
     # with boosting
-    assert arabic_or_ja("دينار") == "(دينار^2.0|דינאר)"
+    assert arabic_or_ja("דינאר") == "(דינאר^2.0 OR دىنار OR ذىنار OR دينار OR ذينار)"
 
 
 def test_arabic_or_ja_exact_phrase():
     # make sure basic exact quote is working
-    assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله"|"תעטל שגלה")'
+    assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله" OR "תעטל שגלה")'
 
     # make sure broken quotes are ignored and arabic words are converted
-    assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل|תעטל) (شغله|שגלה)'
+    assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل OR תעטל) (شغله OR שגלה)'
 
     # to test what would happen if we had 1+ arabic phrases
     # (within quotation marks) and 1+ arabic words (not inside quotes)
     assert (
         arabic_or_ja('"تعطل شغله" etc etc شغله', boost=False)
-        == '("تعطل شغله"|"תעטל שגלה") etc etc (شغله|שגלה)'
+        == '("تعطل شغله" OR "תעטל שגלה") etc etc (شغله OR שגלה)'
     )
 
     # proximity
-    assert arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله"|"תעטל שגלה")~10'
+    assert (
+        arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله" OR "תעטל שגלה")~10'
+    )
 
     # with boosting
-    assert arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0|תעטל) (شغله^2.0|שגלה)"
-    assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0|"תעטל שגלה")'
+    assert (
+        arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0 OR תעטל) (شغله^2.0 OR שגלה)"
+    )
+    assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0 OR "תעטל שגלה")'
 
     # make sure query string is working
     assert (
         arabic_or_ja('transcription:("تعطل شغله") etc etc شغله', boost=False)
-        == 'transcription:(("تعطل شغله"|"תעטל שגלה")) etc etc (شغله|שגלה)'
+        == 'transcription:(("تعطل شغله" OR "תעטל שגלה")) etc etc (شغله OR שגלה)'
     )
 
     # make sure non-arabic field query is left unchanged