From fcab64a6432ec9c41f73cfd6e25dce9f204fd8f4 Mon Sep 17 00:00:00 2001
From: "David R. Mortensen"
Date: Tue, 27 Apr 2021 16:35:25 -0400
Subject: [PATCH] Implemented downloads for cedict

---
 epitran/cedict.py   |  2 +-
 epitran/download.py | 39 +++++++++++++++++++++++++++++++++++++++
 epitran/epihan.py   | 12 +++++++++---
 3 files changed, 49 insertions(+), 4 deletions(-)
 create mode 100644 epitran/download.py

diff --git a/epitran/cedict.py b/epitran/cedict.py
index bb9b1f72..63550d13 100644
--- a/epitran/cedict.py
+++ b/epitran/cedict.py
@@ -47,7 +47,7 @@ def _construct_trie(self, hanzi):
             py, en = df
             py = str(''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py))))
             pairs.append((hz, (py.encode('utf-8'),)))
-        trie = _trie_m.RecordTrie(str('@s'), pairs)
+        trie = marisa_trie.RecordTrie(str('@s'), pairs)
         return trie
 
     def has_key(self, key):
diff --git a/epitran/download.py b/epitran/download.py
new file mode 100644
index 00000000..ac95cbc8
--- /dev/null
+++ b/epitran/download.py
@@ -0,0 +1,39 @@
+"""Download and locate the CC-CEDICT file used by epitran.epihan."""
+
+import os
+import requests
+import gzip
+
+CEDICT_URL='https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz'
+
+def get_dir():
+    """Return the ~/epitran_data/ directory, creating it if it does not exist."""
+    data_dir = os.path.expanduser('~/epitran_data/')
+    os.makedirs(data_dir, exist_ok=True)
+    return data_dir
+
+def get_cedict_file():
+    """Return the path of the decompressed CC-CEDICT text file."""
+    return os.path.join(get_dir(), 'cedict.txt')
+
+def cedict_exists():
+    """Return True if the decompressed CC-CEDICT text file exists."""
+    return os.path.exists(get_cedict_file())
+
+def cedict():
+    """Download CC-CEDICT and store it, decompressed, as UTF-8 text.
+
+    Raises a requests.RequestException subclass on network failure or HTTP error status.
+    """
+    gzfilename = os.path.join(get_dir(), 'cedict.txt.gz')
+    txtfilename = os.path.join(get_dir(), 'cedict.txt')
+    # requests has no default timeout; do not hang forever on a stalled server.
+    r = requests.get(CEDICT_URL, timeout=60)
+    # Do not save an HTTP error page as if it were the gzip archive.
+    r.raise_for_status()
+    with open(gzfilename, 'wb') as f:
+        f.write(r.content)
+    # Write UTF-8 explicitly: the platform default encoding may be unable to
+    # represent the Chinese text.
+    with gzip.open(gzfilename, 'rb') as ip_byte, open(txtfilename, 'w', encoding='utf-8') as op:
+        op.write(ip_byte.read().decode('utf-8'))
diff --git a/epitran/epihan.py b/epitran/epihan.py
index dc9426fd..b1bcdc08 100644
--- a/epitran/epihan.py
+++ b/epitran/epihan.py
@@ -8,6 +8,7 @@
 
 from . import cedict
 from . import rules
+from . import download
 from epitran.ligaturize import ligaturize
 
 
@@ -43,8 +44,10 @@ def __init__(self, ligatures=False, cedict_file=None,
         """
         # If no cedict_file is specified, raise and error
         if not cedict_file:
-            raise MissingData('Please specify a location ' +
-                              'for the CC-CEDict file.')
+            if download.cedict_exists():
+                cedict_file = download.get_cedict_file()
+            else:
+                raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".')
         if tones:
             rules_file = os.path.join('data', 'rules', 'pinyin-to-ipa-tones.txt')
         else:
@@ -110,7 +113,10 @@ def __init__(self, ligatures=False, cedict_file=None, tones=False, rules_file='p
             IPA
         """
         if not cedict_file:
-            raise MissingData('Please specify a location for the CC-CEDict file.')
+            if download.cedict_exists():
+                cedict_file = download.get_cedict_file()
+            else:
+                raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".')
         rules_file = os.path.join('data', 'rules', rules_file)
         rules_file = pkg_resources.resource_filename(__name__, rules_file)
         self.cedict = cedict.CEDictTrie(cedict_file, traditional=True)