From 01112ff1fb8222573ceff374a88b9f235c899c52 Mon Sep 17 00:00:00 2001 From: "hesen.chs" Date: Tue, 31 Oct 2023 17:33:50 +0800 Subject: [PATCH] keep alphabet number punc --- configs/config_all.yaml | 3 + .../remove_non_chinese_character_mapper.py | 20 ++- ...est_remove_non_chinese_character_mapper.py | 132 +++++++++++++++++- 3 files changed, 151 insertions(+), 4 deletions(-) diff --git a/configs/config_all.yaml b/configs/config_all.yaml index bb26119a3..1b19bb309 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -74,6 +74,9 @@ process: min_len: 1 # the min word length to keep words. max_len: 128 # the max word length to keep words. - remove_non_chinese_character_mapper: # remove non Chinese character in text samples. + keep_alphabet: true # whether to keep alphabet + keep_number: true # whether to keep number + keep_punc: true # whether to keep punctuation - remove_specific_chars_mapper: # remove characters specified by users chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□' # a string or a list including those characters that need to be removed - remove_table_text_mapper: # remove possible table texts from text. diff --git a/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py b/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py index f25816d80..14b799452 100644 --- a/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py +++ b/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py @@ -7,15 +7,31 @@ class RemoveNonChineseCharacterlMapper(Mapper): """Mapper to remove non chinese Character in text samples.""" - def __init__(self, *args, **kwargs): + def __init__(self, + keep_alphabet: bool = True, + keep_number: bool = True, + keep_punc: bool = True, + *args, + **kwargs): """ Initialization method. 
+ :param keep_alphabet: whether to keep alphabet + :param keep_number: whether to keep number + :param keep_punc: whether to keep punctuation :param args: extra args :param kwargs: extra args """ super().__init__(*args, **kwargs) - self.pattern = r'[^\u4e00-\u9fa5]' + self.pattern = u'[^\u4e00-\u9fa5' + if keep_alphabet: + self.pattern += u'A-Za-z' + if keep_number: + self.pattern += u'0-9' + if keep_punc: + self.pattern += u'., ,\\-。%《*》/•、&&(—)(+):?!!“”·]+' + else: + self.pattern += u']' def process(self, sample): diff --git a/tests/ops/mapper/test_remove_non_chinese_character_mapper.py b/tests/ops/mapper/test_remove_non_chinese_character_mapper.py index 18785a9dc..d7c1953c8 100644 --- a/tests/ops/mapper/test_remove_non_chinese_character_mapper.py +++ b/tests/ops/mapper/test_remove_non_chinese_character_mapper.py @@ -6,8 +6,9 @@ class RemoveNonChineseCharacterlMapperrTest(unittest.TestCase): - def setUp(self): - self.op = RemoveNonChineseCharacterlMapper() + def setUp(self, keep_alphabet=True, keep_number=True, keep_punc=True): + self.op = RemoveNonChineseCharacterlMapper(keep_alphabet, keep_number, + keep_punc) def _run_remove_non_chinese_character(self, samples): for sample in samples: @@ -29,6 +30,133 @@ def test_remove_non_chinese_character(self): 'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', 'target': '所有的非汉字都会被去掉' }] + self.setUp(False, False, False) + self._run_remove_non_chinese_character(samples) + + def test_remove_non_chinese_character2(self): + + samples = [{ + 'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇', + 'target': '特殊的康熙部首或者扩展部首会被去除几' + }, { + 'text': '请问你是谁dasoidhao@1264fg.45om', + 'target': '请问你是谁dasoidhaofgom' + }, { + 'text': 'ftp://exam匹配ple汉字ma-niè包括rdas繁體字h@hqbchd.ckdhnfes.cds', + 'target': 'ftpexam匹配ple汉字mani包括rdas繁體字hhqbchdckdhnfescds' + }, { + 'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', + 'target': '所有的非汉字ash都h会被qb去掉' + }] + self.setUp(True, False, False) + self._run_remove_non_chinese_character(samples) + + def 
test_remove_non_chinese_character3(self): + + samples = [{ + 'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇', + 'target': '特殊的康熙部首或者扩展部首会被去除几' + }, { + 'text': '请问你是谁dasoidhao@1264fg.45om', + 'target': '请问你是谁126445' + }, { + 'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1', + 'target': '匹配汉12字包括繁88體字1' + }, { + 'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', + 'target': '所有的非汉字44都1246会被4525去掉' + }] + self.setUp(False, True, False) + self._run_remove_non_chinese_character(samples) + + def test_remove_non_chinese_character4(self): + + samples = [{ + 'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇', + 'target': '特殊的康熙部首或者扩展部首会被去除,几' + }, { + 'text': '请问你是谁dasoidhao@1264fg.45om', + 'target': '请问你是谁.' + }, { + 'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1', + 'target': '//匹配汉字-包括繁體字.' + }, { + 'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', + 'target': ' 所有的非汉字都会被*&*去掉' + }] + self.setUp(False, False, True) + self._run_remove_non_chinese_character(samples) + + def test_remove_non_chinese_character5(self): + + samples = [{ + 'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇', + 'target': '特殊的康熙部首或者扩展部首会被去除几' + }, { + 'text': '请问你是谁dasoidhao@1264fg.45om', + 'target': '请问你是谁dasoidhao1264fg45om' + }, { + 'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1', + 'target': 'fexam匹配ple汉12字mani包括rdas繁88體字hhqbchdds1' + }, { + 'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', + 'target': '所有的非汉字a44sh都1246h会被qb4525去掉' + }] + self.setUp(True, True, False) + self._run_remove_non_chinese_character(samples) + + def test_remove_non_chinese_character6(self): + + samples = [{ + 'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇', + 'target': '特殊的康熙部首或者扩展部首会被去除,几' + }, { + 'text': '请问你是谁dasoidhao@1264fg.45om', + 'target': '请问你是谁dasoidhaofg.om' + }, { + 'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1', + 'target': 'f//exam匹配ple汉字ma-ni包括rdas繁體字hhqbchd.ds' + }, { + 'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', + 'target': ' 所有的非汉字ash都h会被*&*qb去掉' + }] + self.setUp(True, False, True) + self._run_remove_non_chinese_character(samples) + + 
def test_remove_non_chinese_character7(self): + + samples = [{ + 'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇', + 'target': '特殊的康熙部首或者扩展部首会被去除,几' + }, { + 'text': '请问你是谁dasoidhao@1264fg.45om', + 'target': '请问你是谁1264.45' + }, { + 'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1', + 'target': '//匹配汉12字-包括繁88體字.1' + }, { + 'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', + 'target': ' 所有的非汉字44都1246会被*&*4525去掉' + }] + self.setUp(False, True, True) + self._run_remove_non_chinese_character(samples) + + def test_remove_non_chinese_character8(self): + + samples = [{ + 'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇', + 'target': '特殊的康熙部首或者扩展部首会被去除,几' + }, { + 'text': '请问你是谁dasoidhao@1264fg.45om', + 'target': '请问你是谁dasoidhao1264fg.45om' + }, { + 'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1', + 'target': 'f//exam匹配ple汉12字ma-ni包括rdas繁88體字hhqbchd.ds1' + }, { + 'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', + 'target': ' 所有的非汉字a44sh都1246h会被*&*qb4525去掉' + }] + self.setUp(True, True, True) self._run_remove_non_chinese_character(samples)