Skip to content

Commit

Permalink
keep alphabet number punc
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhesen committed Oct 31, 2023
1 parent 263910f commit 01112ff
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 4 deletions.
3 changes: 3 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ process:
min_len: 1 # the min word length to keep words.
max_len: 128 # the max word length to keep words.
- remove_non_chinese_character_mapper: # remove non Chinese character in text samples.
keep_alphabet: true # whether to keep alphabet
keep_number: true # whether to keep number
keep_punc: true # whether to keep punctuation
- remove_specific_chars_mapper: # remove characters specified by users
chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□' # a string or a list including those characters that need to be removed
- remove_table_text_mapper: # remove possible table texts from text.
Expand Down
20 changes: 18 additions & 2 deletions data_juicer/ops/mapper/remove_non_chinese_character_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,31 @@
class RemoveNonChineseCharacterlMapper(Mapper):
"""Mapper to remove non chinese Character in text samples."""

def __init__(self,
             keep_alphabet: bool = True,
             keep_number: bool = True,
             keep_punc: bool = True,
             *args,
             **kwargs):
    """
    Initialization method.

    Builds ``self.pattern``, a negated regex character class. CJK
    ideographs in the range U+4E00-U+9FA5 are always part of the kept
    set; the three flags add further characters to that set.

    :param keep_alphabet: whether to keep alphabet (A-Z, a-z)
    :param keep_number: whether to keep number (0-9)
    :param keep_punc: whether to keep punctuation (the fixed list of
        half-width and full-width marks appended below)
    :param args: extra args
    :param kwargs: extra args
    """
    super().__init__(*args, **kwargs)

    # Everything listed after '[^' is excluded from the match, i.e. kept.
    pattern = '[^\u4e00-\u9fa5'
    if keep_alphabet:
        pattern += 'A-Za-z'
    if keep_number:
        pattern += '0-9'
    if keep_punc:
        # The accompanying tests expect the full-width comma to survive and
        # the ASCII colon to be dropped, so the class carries full-width
        # variants next to their ASCII counterparts rather than repeating
        # the same ASCII character twice.
        # NOTE(review): only the comma and colon are pinned by the tests;
        # the other full-width members (and whether '?' should be full-width
        # as well) are inferred - confirm against the intended keep list.
        pattern += '., ,\\-。%《*》/•、&&(—)(+):?!!“”·]+'
    else:
        pattern += ']'
    self.pattern = pattern

def process(self, sample):

Expand Down
132 changes: 130 additions & 2 deletions tests/ops/mapper/test_remove_non_chinese_character_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

class RemoveNonChineseCharacterlMapperrTest(unittest.TestCase):

def setUp(self, keep_alphabet=True, keep_number=True, keep_punc=True):
    """Create the operator under test and store it on ``self.op``.

    The defaults apply when this is called without arguments; individual
    tests call it again with explicit flags to rebuild ``self.op`` with a
    different configuration.

    :param keep_alphabet: forwarded to the mapper's ``keep_alphabet``
    :param keep_number: forwarded to the mapper's ``keep_number``
    :param keep_punc: forwarded to the mapper's ``keep_punc``
    """
    # Pass the flags by keyword so three adjacent booleans cannot be
    # silently transposed.
    self.op = RemoveNonChineseCharacterlMapper(keep_alphabet=keep_alphabet,
                                               keep_number=keep_number,
                                               keep_punc=keep_punc)

def _run_remove_non_chinese_character(self, samples):
for sample in samples:
Expand All @@ -29,6 +30,133 @@ def test_remove_non_chinese_character(self):
'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
'target': '所有的非汉字都会被去掉'
}]
self.setUp(False, False, False)
self._run_remove_non_chinese_character(samples)

def test_remove_non_chinese_character2(self):
    """Letters are kept; digits and punctuation are removed."""
    # The address-like inputs replace an '[email protected]' placeholder
    # that had overwritten the original text.
    # NOTE(review): the exact position of '@' (and of the dots in the third
    # sample) cannot be derived from the targets - confirm against the
    # original fixture.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁dasoidhaofgom'
    }, {
        'text': 'ftp://exam匹配ple汉字ma-niè包括rdas繁體字h@hqbchd.ckdhnfes.cds',
        'target': 'ftpexam匹配ple汉字mani包括rdas繁體字hhqbchdckdhnfescds'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': '所有的非汉字ash都h会被qb去掉'
    }]
    # Arguments: keep_alphabet, keep_number, keep_punc.
    self.setUp(True, False, False)
    self._run_remove_non_chinese_character(samples)

def test_remove_non_chinese_character3(self):
    """Digits are kept; letters and punctuation are removed."""
    # The address-like inputs replace an '[email protected]' placeholder
    # that had overwritten the original text.
    # NOTE(review): the exact position of '@' cannot be derived from the
    # targets - confirm against the original fixture.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁126445'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': '匹配汉12字包括繁88體字1'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': '所有的非汉字44都1246会被4525去掉'
    }]
    # Arguments: keep_alphabet, keep_number, keep_punc.
    self.setUp(False, True, False)
    self._run_remove_non_chinese_character(samples)

def test_remove_non_chinese_character4(self):
    """Punctuation is kept; letters and digits are removed."""
    # The address-like inputs replace an '[email protected]' placeholder
    # that had overwritten the original text.
    # NOTE(review): the exact position of '@' cannot be derived from the
    # targets - confirm against the original fixture.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除,几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁.'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': '//匹配汉字-包括繁體字.'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': ' 所有的非汉字都会被*&*去掉'
    }]
    # Arguments: keep_alphabet, keep_number, keep_punc.
    self.setUp(False, False, True)
    self._run_remove_non_chinese_character(samples)

def test_remove_non_chinese_character5(self):
    """Letters and digits are kept; punctuation is removed."""
    # The address-like inputs replace an '[email protected]' placeholder
    # that had overwritten the original text.
    # NOTE(review): the exact position of '@' cannot be derived from the
    # targets - confirm against the original fixture.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁dasoidhao1264fg45om'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': 'fexam匹配ple汉12字mani包括rdas繁88體字hhqbchdds1'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': '所有的非汉字a44sh都1246h会被qb4525去掉'
    }]
    # Arguments: keep_alphabet, keep_number, keep_punc.
    self.setUp(True, True, False)
    self._run_remove_non_chinese_character(samples)

def test_remove_non_chinese_character6(self):
    """Letters and punctuation are kept; digits are removed."""
    # The address-like inputs replace an '[email protected]' placeholder
    # that had overwritten the original text.
    # NOTE(review): the exact position of '@' cannot be derived from the
    # targets - confirm against the original fixture.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除,几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁dasoidhaofg.om'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': 'f//exam匹配ple汉字ma-ni包括rdas繁體字hhqbchd.ds'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': ' 所有的非汉字ash都h会被*&*qb去掉'
    }]
    # Arguments: keep_alphabet, keep_number, keep_punc.
    self.setUp(True, False, True)
    self._run_remove_non_chinese_character(samples)

def test_remove_non_chinese_character7(self):
    """Digits and punctuation are kept; letters are removed."""
    # The address-like inputs replace an '[email protected]' placeholder
    # that had overwritten the original text.
    # NOTE(review): the exact position of '@' cannot be derived from the
    # targets - confirm against the original fixture.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除,几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁1264.45'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': '//匹配汉12字-包括繁88體字.1'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': ' 所有的非汉字44都1246会被*&*4525去掉'
    }]
    # Arguments: keep_alphabet, keep_number, keep_punc.
    self.setUp(False, True, True)
    self._run_remove_non_chinese_character(samples)

def test_remove_non_chinese_character8(self):
    """Letters, digits and punctuation are all kept."""
    # The address-like inputs replace an '[email protected]' placeholder
    # that had overwritten the original text.
    # NOTE(review): the exact position of '@' cannot be derived from the
    # targets - confirm against the original fixture.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除,几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁dasoidhao1264fg.45om'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': 'f//exam匹配ple汉12字ma-ni包括rdas繁88體字hhqbchd.ds1'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': ' 所有的非汉字a44sh都1246h会被*&*qb4525去掉'
    }]
    # Arguments: keep_alphabet, keep_number, keep_punc.
    self.setUp(True, True, True)
    self._run_remove_non_chinese_character(samples)


Expand Down

0 comments on commit 01112ff

Please sign in to comment.