-
Notifications
You must be signed in to change notification settings - Fork 191
/
chinese_convert_mapper.py
92 lines (64 loc) · 2.84 KB
/
chinese_convert_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from data_juicer.utils.lazy_loader import LazyLoader
from ..base_op import OPERATORS, Mapper
# Handle to the `opencc` package obtained through LazyLoader.
# NOTE(review): presumably the real import is deferred until first use -
# confirm against data_juicer.utils.lazy_loader.
opencc = LazyLoader('opencc', 'opencc')
# Name under which ChineseConvertMapper is registered in OPERATORS.
OP_NAME = 'chinese_convert_mapper'
# Module-level converter shared by every caller in this module; it is created
# and replaced by prepare_converter() and read by
# ChineseConvertMapper.process_batched().
OPENCC_CONVERTER = None
def prepare_converter(mode):
    """
    Make sure the shared module-level OpenCC converter matches ``mode``.

    The converter lives in the global ``OPENCC_CONVERTER``. It is built on
    the first call and rebuilt whenever the ``config`` attribute of the
    current converter does not end with ``<mode>.json``.

    :param mode: name of the conversion mode without the ``.json`` suffix,
        e.g. ``'s2t'``
    """
    global OPENCC_CONVERTER

    config_name = mode + '.json'

    # Nothing has been built yet: start with a converter for this mode.
    if OPENCC_CONVERTER is None:
        OPENCC_CONVERTER = opencc.OpenCC(config_name)

    # `config` on the converter is a config path rather than the bare file
    # name, so compare by suffix instead of by equality.
    if OPENCC_CONVERTER.config.endswith(config_name):
        return

    # The existing converter was built for another mode: replace it.
    OPENCC_CONVERTER = opencc.OpenCC(config_name)
@OPERATORS.register_module(OP_NAME)
class ChineseConvertMapper(Mapper):
    """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese
    and Japanese Kanji."""

    # NOTE(review): presumably tells the framework that this operator works
    # on whole batches via process_batched() - confirm in base_op.
    _batched_op = True

    def __init__(self, mode: str = 's2t', *args, **kwargs):
        """
        Initialization method.

        :param mode: Choose the mode to convert Chinese:

            s2t: Simplified Chinese to Traditional Chinese,

            t2s: Traditional Chinese to Simplified Chinese,

            s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),

            tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,

            s2hk: Simplified Chinese to Traditional Chinese
            (Hong Kong variant),

            hk2s: Traditional Chinese (Hong Kong variant) to Simplified
            Chinese,

            s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
            with Taiwanese idiom,

            tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
            with Mainland Chinese idiom,

            t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),

            tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,

            hk2t: Traditional Chinese (Hong Kong variant) to Traditional
            Chinese,

            t2hk: Traditional Chinese to Traditional Chinese
            (Hong Kong variant),

            t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese
            Kanji,

            jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese
            Characters.
        :param args: extra args
        :param kwargs: extra args
        :raises AssertionError: if ``mode`` is not one of the modes listed
            above
        """
        super().__init__(*args, **kwargs)
        mode_list = [
            's2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp',
            't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t'
        ]
        # Raise explicitly instead of using an `assert` statement so that the
        # validation is not stripped when Python runs with -O. The exception
        # type and message are kept as before for existing callers.
        if mode not in mode_list:
            raise AssertionError(
                'Please make sure mode is one of {}'.format(mode_list))
        self.mode = mode
        prepare_converter(self.mode)

    def process_batched(self, samples):
        """
        Convert every text of a batch with the configured mode.

        :param samples: batch indexed by ``self.text_key``; the value stored
            under that key is iterated as a sequence of texts
        :return: the same ``samples`` object, with the entry under
            ``self.text_key`` replaced by the list of converted texts
        """
        # The converter is a module-level global, so make sure it exists and
        # matches this operator's mode before converting the batch.
        prepare_converter(self.mode)
        samples[self.text_key] = [
            OPENCC_CONVERTER.convert(text) for text in samples[self.text_key]
        ]
        return samples