Adding dictionary for Arabic (#129)

* clean_arabic added * Arabic added to supported languages * arabic files added to scripts/data * ar.json.gz added to spellchecker/resources * README.rst updated
barrust · Aug 29, 2022 · 1ef8359 · 1ef8359
1 parent 904b052
commit 1ef8359
Show file tree

Hide file tree

Showing 7 changed files with 64 additions and 2 deletions.
diff --git a/README.rst b/README.rst
@@ -135,6 +135,7 @@ package. Each is simple to use when initializing the dictionary:
     english = SpellChecker()  # the default is English (language='en')
     spanish = SpellChecker(language='es')  # use the Spanish Dictionary
     russian = SpellChecker(language='ru')  # use the Russian Dictionary
+    arabic = SpellChecker(language='ar')  # use the Arabic Dictionary
 
 
 The currently supported dictionaries are:
@@ -145,6 +146,7 @@ The currently supported dictionaries are:
 * Portuguese    - 'pt'
 * German        - 'de'
 * Russian       - 'ru'
+* Arabic        - 'ar'
 
 Dictionary Creation and Updating
 -------------------------------------------------------------------------------

diff --git a/scripts/build_dictionary.py b/scripts/build_dictionary.py
@@ -11,6 +11,7 @@
             French Input:     http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fr.gz
             Portuguese Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.pt.gz
             Russian Input:    http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ru.gz
+            Arabic Input:     http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ar.gz
     Requirements:
             The script requires more than the standard library to run in its
             entirety. You will also need to install the NLTK package to build a
@@ -589,6 +590,63 @@ def clean_russian(word_frequency, filepath_exclude, filepath_include):
     return word_frequency
 
 
+def clean_arabic(word_frequency, filepath_exclude, filepath_include):
+    """Clean an Arabic word frequency list
+
+    Args:
+        word_frequency (Counter):
+        filepath_exclude (str):
+        filepath_include (str):
+    """
+    letters = set("دجحإﻹﻷأآﻵخهعغفقثصضذطكمنتالبيسشظزوةىﻻرؤءئ")
+
+    # remove words with invalid characters
+    invalid_chars = list()
+    for key in word_frequency:
+        kl = set(key)
+        if kl.issubset(letters):
+            continue
+        invalid_chars.append(key)
+    for misfit in invalid_chars:
+        word_frequency.pop(misfit)
+
+    # remove ellipses
+    ellipses = list()
+    for key in word_frequency:
+        if ".." in key:
+            ellipses.append(key)
+    for misfit in ellipses:
+        word_frequency.pop(misfit)
+
+    # TODO: other possible fixes?
+
+    # remove small numbers
+    small_frequency = list()
+    for key in word_frequency:
+        if word_frequency[key] <= MINIMUM_FREQUENCY:
+            small_frequency.append(key)
+    for misfit in small_frequency:
+        word_frequency.pop(misfit)
+
+    # remove flagged misspellings
+    with load_file(filepath_exclude) as fobj:
+        for line in fobj:
+            line = line.strip()
+            if line in word_frequency:
+                word_frequency.pop(line)
+
+    # Add known missing words back in (ugh)
+    with load_file(filepath_include) as fobj:
+        for line in fobj:
+            line = line.strip()
+            if line in word_frequency:
+                print("{} is already found in the dictionary! Skipping!")
+            else:
+                word_frequency[line] = MINIMUM_FREQUENCY
+
+    return word_frequency
+
+
 def _parse_args():
     """parse arguments for command-line usage"""
     import argparse
@@ -597,7 +655,7 @@ def _parse_args():
         description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
     )
     parser.add_argument(
-        "-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru"]
+        "-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru", "ar"]
     )
     parser.add_argument(
         "-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
@@ -671,6 +729,8 @@ def _parse_args():
         word_frequency = clean_portuguese(word_frequency, exclude_filepath, include_filepath)
     elif args.language == "ru":
         word_frequency = clean_russian(word_frequency, exclude_filepath, include_filepath)
+    elif args.language == "ar":
+        word_frequency = clean_arabic(word_frequency, exclude_filepath, include_filepath)
 
     # export word frequency for review!
     word_frequency_path = os.path.join(script_path, "{}.json".format(args.language))

diff --git a/scripts/data/ar_exclude.txt b/scripts/data/ar_exclude.txt
diff --git a/scripts/data/ar_full.json.gz b/scripts/data/ar_full.json.gz
diff --git a/scripts/data/ar_include.txt b/scripts/data/ar_include.txt
diff --git a/spellchecker/resources/ar.json.gz b/spellchecker/resources/ar.json.gz
diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py
@@ -83,7 +83,7 @@ def __iter__(self) -> typing.Generator[str, None, None]:
     @classmethod
     def languages(cls) -> typing.Iterable[str]:
         """list: A list of all official languages supported by the library"""
-        return ["de", "en", "es", "fr", "pt", "ru"]
+        return ["de", "en", "es", "fr", "pt", "ru", "ar"]
 
     @property
     def word_frequency(self) -> "WordFrequency":