Skip to content

Commit

Permalink
Implemented downloads for cedict
Browse files Browse the repository at this point in the history
  • Loading branch information
dmort27 committed Apr 27, 2021
1 parent 53e9af8 commit fcab64a
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 4 deletions.
2 changes: 1 addition & 1 deletion epitran/cedict.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def _construct_trie(self, hanzi):
py, en = df
py = str(''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py))))
pairs.append((hz, (py.encode('utf-8'),)))
trie = _trie_m.RecordTrie(str('@s'), pairs)
trie = marisa_trie.RecordTrie(str('@s'), pairs)
return trie

def has_key(self, key):
Expand Down
28 changes: 28 additions & 0 deletions epitran/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os
import requests
import gzip

# Download location of the gzip-compressed CC-CEDICT export fetched by cedict().
CEDICT_URL='https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz'

def get_dir():
    """Return the per-user Epitran data directory, creating it if needed.

    The directory is ``~/epitran_data/`` with ``~`` expanded to the current
    user's home directory. The returned path keeps its trailing separator.
    """
    epitran_home = os.path.expanduser('~/epitran_data/')
    # exist_ok makes repeated calls harmless once the directory is in place.
    os.makedirs(epitran_home, exist_ok=True)
    return epitran_home

def get_cedict_file():
    """Return the path of ``cedict.txt`` inside the directory from get_dir()."""
    data_dir = get_dir()
    return os.path.join(data_dir, 'cedict.txt')

def cedict_exists():
    """Report whether the path given by get_cedict_file() exists on disk."""
    dict_path = get_cedict_file()
    return os.path.exists(dict_path)

def cedict():
    """Download CC-CEDICT and unpack it into the Epitran data directory.

    Fetches the gzip archive at ``CEDICT_URL``, saves it as ``cedict.txt.gz``
    in the directory returned by ``get_dir()``, and decompresses it to
    ``cedict.txt`` next to it.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.RequestException: on connection failure or timeout.
        OSError: if the archive cannot be written or decompressed.
    """
    import shutil  # local import: only this function needs it

    data_dir = get_dir()
    gzfilename = os.path.join(data_dir, 'cedict.txt.gz')
    txtfilename = os.path.join(data_dir, 'cedict.txt')
    partfilename = txtfilename + '.part'

    # Without a timeout a stalled connection would block forever.
    r = requests.get(CEDICT_URL, timeout=60)
    # Fail here rather than saving an HTML error page as the archive.
    r.raise_for_status()
    with open(gzfilename, 'wb') as f:
        f.write(r.content)

    # Copy the decompressed bytes as-is: the payload is already UTF-8, so
    # staying in binary mode avoids the platform default text encoding
    # (which cannot represent Chinese characters on some systems) and
    # avoids holding the whole decoded file in memory.
    with gzip.open(gzfilename, 'rb') as ip_byte, open(partfilename, 'wb') as op:
        shutil.copyfileobj(ip_byte, op)
    # Publish the file only once it is complete, so an interrupted run does
    # not leave a truncated cedict.txt that cedict_exists() would accept.
    os.replace(partfilename, txtfilename)



12 changes: 9 additions & 3 deletions epitran/epihan.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from . import cedict
from . import rules
from . import download
from epitran.ligaturize import ligaturize


Expand Down Expand Up @@ -43,8 +44,10 @@ def __init__(self, ligatures=False, cedict_file=None,
"""
# If no cedict_file is specified, fall back to the downloaded copy; raise an error if there is none
if not cedict_file:
raise MissingData('Please specify a location ' +
'for the CC-CEDict file.')
if download.cedict_exists():
cedict_file = download.get_cedict_file()
else:
raise MissingData('Download CC-CEDICT with "epitran.download.cedict()')
if tones:
rules_file = os.path.join('data', 'rules', 'pinyin-to-ipa-tones.txt')
else:
Expand Down Expand Up @@ -110,7 +113,10 @@ def __init__(self, ligatures=False, cedict_file=None, tones=False, rules_file='p
IPA
"""
if not cedict_file:
raise MissingData('Please specify a location for the CC-CEDict file.')
if download.cedict_exists():
cedict_file = download.get_cedict_file()
else:
raise MissingData('Download CC-CEDICT with "epitran.download.cedict().')
rules_file = os.path.join('data', 'rules', rules_file)
rules_file = pkg_resources.resource_filename(__name__, rules_file)
self.cedict = cedict.CEDictTrie(cedict_file, traditional=True)
Expand Down

0 comments on commit fcab64a

Please sign in to comment.