Skip to content

Commit

Permalink
Adding dictionary for Arabic (#129)
Browse files Browse the repository at this point in the history
* clean_arabic added

* Arabic added to supported languages

* arabic files added to scripts/data

* ar.json.gz added to spellchecker/resources

* README.rst updated
  • Loading branch information
msalhab96 authored Aug 29, 2022
1 parent 904b052 commit 1ef8359
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 2 deletions.
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ package. Each is simple to use when initializing the dictionary:
english = SpellChecker() # the default is English (language='en')
spanish = SpellChecker(language='es') # use the Spanish Dictionary
russian = SpellChecker(language='ru') # use the Russian Dictionary
arabic = SpellChecker(language='ar') # use the Arabic Dictionary
The currently supported dictionaries are:
Expand All @@ -145,6 +146,7 @@ The currently supported dictionaries are:
* Portuguese - 'pt'
* German - 'de'
* Russian - 'ru'
* Arabic - 'ar'

Dictionary Creation and Updating
-------------------------------------------------------------------------------
Expand Down
62 changes: 61 additions & 1 deletion scripts/build_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
French Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fr.gz
Portuguese Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.pt.gz
Russian Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ru.gz
Arabic Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ar.gz
Requirements:
The script requires more than the standard library to run in its
entirety. You will also need to install the NLTK package to build a
Expand Down Expand Up @@ -589,6 +590,63 @@ def clean_russian(word_frequency, filepath_exclude, filepath_include):
return word_frequency


def clean_arabic(word_frequency, filepath_exclude, filepath_include):
    """Clean an Arabic word frequency list

    The mapping is cleaned in place, in this order:

    1. drop every word containing a character outside the accepted
       Arabic letter set;
    2. drop every word whose count is at or below MINIMUM_FREQUENCY;
    3. drop every word listed in the exclude file;
    4. add every word listed in the include file that is not already
       present, with a count of MINIMUM_FREQUENCY.

    Args:
        word_frequency (Counter): Word -> count mapping; modified in place
        filepath_exclude (str): Path, opened with load_file, of words to \
            remove (one word per line)
        filepath_include (str): Path, opened with load_file, of words to \
            add (one word per line)
    Returns:
        Counter: The same `word_frequency` object that was passed in
    """
    letters = set("دجحإﻹﻷأآﻵخهعغفقثصضذطكمنتالبيسشظزوةىﻻرؤءئ")

    # remove words with invalid characters; keys are collected first because
    # the mapping cannot be resized while it is being iterated
    invalid_chars = [key for key in word_frequency if not letters.issuperset(key)]
    for misfit in invalid_chars:
        del word_frequency[misfit]

    # NOTE: no separate ellipsis ("..") pass is needed here: "." is not in
    # `letters`, so any such word was already removed by the filter above.

    # TODO: other possible fixes?

    # remove low-frequency words
    small_frequency = [key for key, count in word_frequency.items() if count <= MINIMUM_FREQUENCY]
    for misfit in small_frequency:
        del word_frequency[misfit]

    # remove flagged misspellings
    with load_file(filepath_exclude) as fobj:
        for line in fobj:
            # pop with a default so words that are already gone are ignored
            word_frequency.pop(line.strip(), None)

    # Add known missing words back in (ugh)
    with load_file(filepath_include) as fobj:
        for line in fobj:
            line = line.strip()
            if not line:
                # a blank line must not add the empty string as a word
                continue
            if line in word_frequency:
                print("{} is already found in the dictionary! Skipping!".format(line))
            else:
                word_frequency[line] = MINIMUM_FREQUENCY

    return word_frequency


def _parse_args():
"""parse arguments for command-line usage"""
import argparse
Expand All @@ -597,7 +655,7 @@ def _parse_args():
description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
)
parser.add_argument(
"-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru"]
"-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru", "ar"]
)
parser.add_argument(
"-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
Expand Down Expand Up @@ -671,6 +729,8 @@ def _parse_args():
word_frequency = clean_portuguese(word_frequency, exclude_filepath, include_filepath)
elif args.language == "ru":
word_frequency = clean_russian(word_frequency, exclude_filepath, include_filepath)
elif args.language == "ar":
word_frequency = clean_arabic(word_frequency, exclude_filepath, include_filepath)

# export word frequency for review!
word_frequency_path = os.path.join(script_path, "{}.json".format(args.language))
Expand Down
Empty file added scripts/data/ar_exclude.txt
Empty file.
Binary file added scripts/data/ar_full.json.gz
Binary file not shown.
Empty file added scripts/data/ar_include.txt
Empty file.
Binary file added spellchecker/resources/ar.json.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def __iter__(self) -> typing.Generator[str, None, None]:
@classmethod
def languages(cls) -> typing.Iterable[str]:
    """list: A list of all official languages supported by the library"""
    # Only the updated list is returned; a leftover earlier `return` without
    # "ar" would make the Arabic entry unreachable.
    return ["de", "en", "es", "fr", "pt", "ru", "ar"]

@property
def word_frequency(self) -> "WordFrequency":
Expand Down

0 comments on commit 1ef8359

Please sign in to comment.