-
Notifications
You must be signed in to change notification settings - Fork 191
/
punctuation_normalization_mapper.py
65 lines (58 loc) · 1.69 KB
/
punctuation_normalization_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Some code here has been modified from:
# https://github.com/bigscience-workshop/data-preparation
# --------------------------------------------------------
from ..base_op import OPERATORS, Mapper
@OPERATORS.register_module('punctuation_normalization_mapper')
class PunctuationNormalizationMapper(Mapper):
    """Mapper to normalize unicode punctuations to English punctuations in
    text samples.

    Every character of a text that appears as a key of
    ``punctuation_unicode`` is replaced by the mapped ASCII string; all
    other characters are left untouched.
    """

    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # Keys must be the *non-ASCII* variants. Fullwidth forms are
        # written as escapes so they cannot be silently folded into
        # their ASCII look-alikes: an ASCII key such as '.' or '1'
        # would rewrite ordinary punctuation and digits in every text.
        self.punctuation_unicode = {
            '\uff0c': ',',  # fullwidth comma
            '。': '.',
            '、': ',',
            '„': '"',
            '”': '"',
            '“': '"',
            '«': '"',
            '»': '"',
            # Fullwidth digit one.
            # NOTE(review): presumably inherited from the table this file
            # was adapted from; confirm mapping it to a quote is intended.
            '\uff11': '"',
            '」': '"',
            '「': '"',
            '《': '"',
            '》': '"',
            '´': "'",
            '∶': ':',
            '\uff1a': ':',  # fullwidth colon
            '\uff1f': '?',  # fullwidth question mark
            '\uff01': '!',  # fullwidth exclamation mark
            '\uff08': '(',  # fullwidth left parenthesis
            '\uff09': ')',  # fullwidth right parenthesis
            '\uff1b': ';',  # fullwidth semicolon
            '–': '-',
            '—': ' - ',
            '\uff0e': '. ',  # fullwidth full stop
            '\uff5e': '~',  # fullwidth tilde
            '’': "'",
            '…': '...',
            '━': '-',
            '〈': '<',
            '〉': '>',
            '【': '[',
            '】': ']',
            '\uff05': '%',  # fullwidth percent sign
            '►': '-',
        }

    def process_batched(self, samples):
        """
        Normalize the punctuation of every text in a batch.

        :param samples: mapping whose ``self.text_key`` entry is an
            iterable of strings (``text_key`` is not set in this class;
            presumably provided by ``Mapper``)
        :return: the same ``samples`` object, with the ``self.text_key``
            entry replaced by a list of normalized strings
        """
        # Build the table from the instance mapping on each call so
        # changes made to ``punctuation_unicode`` after construction are
        # still honoured; ``str.translate`` then performs the
        # per-character substitution in a single pass per text.
        table = str.maketrans(self.punctuation_unicode)
        samples[self.text_key] = [
            text.translate(table) for text in samples[self.text_key]
        ]
        return samples