Skip to content

Commit

Permalink
Allow searching JA to return Arabic results (#1679)
Browse files Browse the repository at this point in the history
  • Loading branch information
blms committed Nov 25, 2024
1 parent 9617499 commit c790685
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 36 deletions.
118 changes: 98 additions & 20 deletions geniza/corpus/ja.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,44 @@
"נ": "ן",
}

# Mapping from Judaeo-Arabic (Hebrew-script) letters to Arabic-script letters.
# A str value means there is a single Arabic candidate; a list value means the
# Hebrew letter may stand for any of the listed Arabic letters.
# Final letter forms and variants carrying a combining dot have their own keys.
# NOTE: ja_to_arabic iterates this dict in insertion order, so the order of the
# entries determines the order of the alternatives it generates.
ja_arabic_chars = {
    "א": "ا",
    "ב": "ب",
    "ג": ["غ", "ج"],
    "ג̇": ["غ", "ج"],
    "ד": ["د", "ذ"],
    "ד̇": ["د", "ذ"],
    "ה": ["ة", "ه"],
    "ו": "و",
    "ז": "ز",
    "ח": "ح",
    "ט": ["ط", "ظ"],
    "ט̇": ["ط", "ظ"],
    "י": ["ى", "ي"],
    "ך": ["ك", "خ"],
    "ך̇": ["ك", "خ"],
    "כ": ["ك", "خ"],
    "כ̇": ["ك", "خ"],
    "ל": "ل",
    "ם": "م",
    "מ": "م",
    "ן": "ن",
    "נ": "ن",
    "ס": "س",
    "ע": "ع",
    "ף": "ف",
    "פ": "ف",
    "ץ": ["ص", "ض"],
    "ץ̇": ["ص", "ض"],
    "צ": ["ص", "ض"],
    "צ̇": ["ص", "ض"],
    "ק": "ق",
    "ר": "ر",
    "ש": "ش",
    "ת": ["ت", "ث"],
    "ת̇": ["ت", "ث"],
}

# iso codes are AR and JRB if we want to use those

# generate translation tables
Expand All @@ -69,45 +107,85 @@ def contains_arabic(text):


def arabic_to_ja(text):
    """Convert Arabic-script text to Judaeo-Arabic (Hebrew script).

    Text that contains no Arabic is returned exactly as given (it is not
    stripped). Otherwise every character is mapped through
    ``arabic_to_ja_table`` (this handles multiple words at once), the result
    is stripped, and letters matched by ``re_he_final_letters`` are swapped
    for their final forms.

    :param text: word or phrase to convert
    :returns: the converted string, or ``text`` itself when it has no Arabic
    """

    def _to_final_form(match):
        # look up the final form for the letter the regex matched
        return he_final_letters[match.group(0)]

    if contains_arabic(text):
        converted = text.translate(arabic_to_ja_table).strip()
        # a regex is used (rather than indexing the last character) because
        # accented characters complicate last-letter indexing
        return re.sub(re_he_final_letters, _to_final_form, converted)

    # nothing to translate
    return text


# regex to find arabic word or exact phrase with only arabic + whitespace
# (an exact phrase is a double-quoted run of arabic characters and whitespace)
re_AR_WORD_OR_PHRASE = re.compile(
    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
)
# regex for one or more consecutive characters in the hebrew range
# U+0590 - U+05FE
re_HE_letters = re.compile(r"[\u0590-\u05fe]+")


def arabic_or_ja(text, boost=True):
# find arabic tokens
arabic_wordphrases = re_AR_WORD_OR_PHRASE.findall(text)
def contains_hebrew(text):
    """Check whether ``text`` contains any Hebrew letters.

    :param text: string to inspect
    :returns: the match object for the first run of Hebrew characters
        (truthy), or ``None`` when there is none
    """
    first_hebrew_run = re_HE_letters.search(text)
    return first_hebrew_run


def ja_to_arabic(text):
    """Translate Judaeo-Arabic (Hebrew-script) text into Arabic-script terms.

    ``str.translate`` cannot be used here because Hebrew has fewer letters
    than Arabic, so one Hebrew letter may stand for several Arabic letters.
    Whenever such a letter occurs in ``text``, one copy of the whole text is
    produced per candidate letter and the copies are joined with `` OR ``.
    The number of alternatives therefore doubles with every distinct
    ambiguous letter. Handles multiple words; characters without a mapping
    are left untouched.

    :param text: word or phrase to convert
    :returns: the stripped conversion: a single Arabic spelling, or all
        candidate spellings joined with `` OR ``
    """
    # Try multi-character keys (letter + combining dot) before the bare base
    # letters. Otherwise the base letter is replaced first, the dotted key can
    # never match, and the combining dot is left stranded on the Arabic
    # letter. sorted() is stable (also with reverse=True), so keys of equal
    # length keep the order in which the mapping defines them.
    ordered_mappings = sorted(
        ja_arabic_chars.items(), key=lambda item: len(item[0]), reverse=True
    )

    for hebrew, arabic in ordered_mappings:
        if isinstance(arabic, list):
            if hebrew in text:
                # more than one option: one full copy of the text per option,
                # joined with OR
                text = " OR ".join(
                    text.replace(hebrew, option) for option in arabic
                )
        elif isinstance(arabic, str):
            # only one possible translation
            text = text.replace(hebrew, arabic)

    return text.strip()


def make_translingual(text, boost, pattern, trans_func):
    """Rewrite every token of ``text`` matched by ``pattern`` as an OR group.

    Each matching word or phrase ``m`` becomes ``(m OR trans_func(m))``, or
    ``(m^2.0 OR trans_func(m))`` when ``boost`` is truthy. Everything
    surrounding the matches is kept unchanged.

    :param text: search query string
    :param boost: when truthy, append ``^2.0`` to the original term
    :param pattern: compiled regex whose matches are the words or phrases to
        rewrite
    :param trans_func: callable that takes a matched string and returns its
        counterpart in the other script
    :returns: the rewritten query string
    """
    boost_suffix = "^2.0" if boost else ""

    def _or_group(match):
        # always use the full match, so patterns that contain capture groups
        # are rewritten correctly as well
        wordphrase = match.group(0)
        return f"({wordphrase}{boost_suffix} OR {trans_func(wordphrase)})"

    # substitute in place in one pass; this is equivalent to splitting on the
    # matches and stitching the pieces back together around rewritten terms
    return pattern.sub(_or_group, text)


# regex to find hebrew word, or exact phrase with only hebrew + whitespace
# (an exact phrase is a double-quoted run of hebrew characters and whitespace)
re_HE_WORD_OR_PHRASE = re.compile(
    r'"[\u0590-\u05fe]+[\s\u0590-\u05fe]*"|[\u0590-\u05fe]+'
)

# regex to find arabic word or exact phrase with only arabic + whitespace
# (an exact phrase is a double-quoted run of arabic characters and whitespace)
re_AR_WORD_OR_PHRASE = re.compile(
    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
)


def arabic_or_ja(text, boost=True):
    """Expand a search query so terms match in both Arabic and Judaeo-Arabic.

    Hebrew-script terms are paired with their Arabic conversions and
    Arabic-script terms with their Judaeo-Arabic conversions. When the query
    contains both scripts, one rewritten copy of the query is produced per
    script and the two are combined as ``(<hebrew rewrite> OR <arabic
    rewrite>)``.

    :param text: search query string
    :param boost: passed to ``make_translingual``; boosts the term as typed
    :returns: the rewritten query, or ``text`` unchanged when it contains
        neither Hebrew nor Arabic
    """
    script_handlers = (
        (contains_hebrew(text), re_HE_WORD_OR_PHRASE, ja_to_arabic),
        (contains_arabic(text), re_AR_WORD_OR_PHRASE, arabic_to_ja),
    )
    rewrites = [
        make_translingual(text, boost, pattern, convert)
        for present, pattern, convert in script_handlers
        if present
    ]

    if not rewrites:
        # neither script present: nothing to rewrite
        return text
    if len(rewrites) == 1:
        return rewrites[0]
    return f"({' OR '.join(rewrites)})"
2 changes: 1 addition & 1 deletion geniza/corpus/tests/test_corpus_solrqueryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def test_search_term_cleanup__nonbool(self):
def test_search_term_cleanup__arabic_to_ja(self):
dqs = DocumentSolrQuerySet()
# confirm arabic to judaeo-arabic runs here (with boost)
assert dqs._search_term_cleanup("دينار") == "(دينار^2.0|דינאר)"
assert dqs._search_term_cleanup("دينار") == "(دينار^2.0 OR דינאר)"
# confirm arabic to judaeo-arabic does not run here
assert (
dqs._search_term_cleanup('"دي[نا]ر"')
Expand Down
73 changes: 58 additions & 15 deletions geniza/corpus/tests/test_ja.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from operator import contains

from geniza.corpus.ja import arabic_or_ja, arabic_to_ja, contains_arabic
from geniza.corpus.ja import (
arabic_or_ja,
arabic_to_ja,
contains_arabic,
contains_hebrew,
ja_to_arabic,
)


def test_contains_arabic():
Expand All @@ -19,48 +23,87 @@ def test_arabic_to_ja():
assert arabic_to_ja("english text") == "english text"


def test_arabic_or_ja__no_arabic():
def test_contains_hebrew():
    """contains_hebrew is truthy only for text that includes Hebrew letters."""
    without_hebrew = ("my keyword search", "دينار")
    with_hebrew = ("דינאר mixed with english", "mixed מצחף and english")

    for sample in without_hebrew:
        assert not contains_hebrew(sample)
    for sample in with_hebrew:
        assert contains_hebrew(sample)


def test_ja_to_arabic():
    """ja_to_arabic yields every candidate Arabic spelling, joined with OR."""
    # (input, expected conversion) pairs, checked in order
    cases = [
        ("דינאר", "دىنار OR ذىنار OR دينار OR ذينار"),
        ("מצחף", "مصحف OR مضحف"),
        ("סנה", "سنة OR سنه"),
        ("טבאךֹ", "طباكֹ OR ظباكֹ OR طباخֹ OR ظباخֹ"),
        ("מ", "م"),
        ("", ""),
        ("english text", "english text"),
        ("دينار", "دينار"),
    ]
    for source, expected in cases:
        assert ja_to_arabic(source) == expected


def test_arabic_or_ja__no_arabic_or_ja():
    """A query with neither Arabic nor Hebrew comes back unchanged."""
    plain_query = "my keyword search"
    rewritten = arabic_or_ja(plain_query)
    assert rewritten == plain_query


def test_arabic_or_ja__arabic():
    """Arabic terms are paired with their Judaeo-Arabic form using OR."""
    # single word — should return match for arabic or judaeo-arabic
    assert arabic_or_ja("دينار", boost=False) == "(دينار OR דינאר)"
    # multiple words — should return match for arabic or judaeo-arabic
    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار OR דינאר) (مصحف OR מצחף)"
    # mixed english and arabic
    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف OR מצחף)"
    # with boosting (boost defaults to True)
    assert arabic_or_ja("دينار") == "(دينار^2.0 OR דינאר)"


def test_arabic_or_ja__ja():
    """Judaeo-Arabic terms are paired with all Arabic candidates using OR."""
    # single word — should return match for arabic or judaeo-arabic
    assert (
        arabic_or_ja("דינאר", boost=False)
        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار)"
    )
    # multiple words — should return match for arabic or judaeo-arabic
    assert (
        arabic_or_ja("דינאר מצחף", boost=False)
        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار) (מצחף OR مصحف OR مضحف)"
    )
    # mixed english and judaeo-arabic
    assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף OR مصحف OR مضحف)"
    # with boosting (boost defaults to True)
    assert arabic_or_ja("דינאר") == "(דינאר^2.0 OR دىنار OR ذىنار OR دينار OR ذينار)"


def test_arabic_or_ja_exact_phrase():
# make sure basic exact quote is working
assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله"|"תעטל שגלה")'
assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله" OR "תעטל שגלה")'

# make sure broken quotes are ignored and arabic words are converted
assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل|תעטל) (شغله|שגלה)'
assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل OR תעטל) (شغله OR שגלה)'

# to test what would happen if we had 1+ arabic phrases
# (within quotation marks) and 1+ arabic words (not inside quotes)
assert (
arabic_or_ja('"تعطل شغله" etc etc شغله', boost=False)
== '("تعطل شغله"|"תעטל שגלה") etc etc (شغله|שגלה)'
== '("تعطل شغله" OR "תעטל שגלה") etc etc (شغله OR שגלה)'
)

# proximity
assert arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله"|"תעטל שגלה")~10'
assert (
arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله" OR "תעטל שגלה")~10'
)

# with boosting
assert arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0|תעטל) (شغله^2.0|שגלה)"
assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0|"תעטל שגלה")'
assert (
arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0 OR תעטל) (شغله^2.0 OR שגלה)"
)
assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0 OR "תעטל שגלה")'

# make sure query string is working
assert (
arabic_or_ja('transcription:("تعطل شغله") etc etc شغله', boost=False)
== 'transcription:(("تعطل شغله"|"תעטל שגלה")) etc etc (شغله|שגלה)'
== 'transcription:(("تعطل شغله" OR "תעטל שגלה")) etc etc (شغله OR שגלה)'
)

# make sure non-arabic field query is left unchanged
Expand Down

0 comments on commit c790685

Please sign in to comment.