-
Notifications
You must be signed in to change notification settings - Fork 192
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
151 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,8 +6,9 @@ | |
|
||
class RemoveNonChineseCharacterlMapperrTest(unittest.TestCase): | ||
|
||
def setUp(self): | ||
self.op = RemoveNonChineseCharacterlMapper() | ||
def setUp(self, keep_alphabet=True, keep_number=True, keep_punc=True):
    """Build the operator under test and store it on ``self.op``.

    The three flags are forwarded positionally, in this order, to
    ``RemoveNonChineseCharacterlMapper``. Every flag defaults to ``True``
    so the argument-less call made by the unittest runner still works;
    individual tests call this method again with explicit flags.
    """
    flags = (keep_alphabet, keep_number, keep_punc)
    self.op = RemoveNonChineseCharacterlMapper(*flags)
|
||
def _run_remove_non_chinese_character(self, samples): | ||
for sample in samples: | ||
|
@@ -29,6 +30,133 @@ def test_remove_non_chinese_character(self): | |
'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉', | ||
'target': '所有的非汉字都会被去掉' | ||
}] | ||
self.setUp(False, False, False) | ||
self._run_remove_non_chinese_character(samples) | ||
|
||
def test_remove_non_chinese_character2(self):
    """Check the op built with keep_alphabet=True, keep_number=False,
    keep_punc=False against four text/target pairs."""
    # NOTE(review): the second and third inputs previously contained the
    # e-mail obfuscation placeholder '[email protected]', which cannot
    # yield the targets below. The addresses are reconstructed from the
    # targets: the letter sequence is pinned by them, while the positions
    # of '@' and '.' are a best guess (they do not appear in this test's
    # targets) -- TODO confirm against the upstream repository.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁dasoidhaofgom'
    }, {
        'text': 'ftp://exam匹配ple汉字ma-niè包括rdas繁體字h@hqbchd.ckdhnfes.cds',
        'target': 'ftpexam匹配ple汉字mani包括rdas繁體字hhqbchdckdhnfescds'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': '所有的非汉字ash都h会被qb去掉'
    }]
    # Rebuild self.op with the flag combination this test exercises.
    self.setUp(True, False, False)
    self._run_remove_non_chinese_character(samples)
|
||
def test_remove_non_chinese_character3(self):
    """Check the op built with keep_alphabet=False, keep_number=True,
    keep_punc=False against four text/target pairs."""
    # NOTE(review): the second and third inputs previously contained the
    # e-mail obfuscation placeholder '[email protected]', which cannot
    # yield the targets below. The addresses are reconstructed so that
    # their digits match the targets; the position of '@' is a best guess
    # (it never appears in a target) -- TODO confirm against the upstream
    # repository.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁126445'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': '匹配汉12字包括繁88體字1'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': '所有的非汉字44都1246会被4525去掉'
    }]
    # Rebuild self.op with the flag combination this test exercises.
    self.setUp(False, True, False)
    self._run_remove_non_chinese_character(samples)
|
||
def test_remove_non_chinese_character4(self):
    """Check the op built with keep_alphabet=False, keep_number=False,
    keep_punc=True against four text/target pairs."""
    # NOTE(review): the second and third inputs previously contained the
    # e-mail obfuscation placeholder '[email protected]', which cannot
    # yield the targets below. The addresses are reconstructed so that
    # their punctuation matches the targets; the position of '@' is a
    # best guess (it never appears in a target) -- TODO confirm against
    # the upstream repository.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除,几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁.'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': '//匹配汉字-包括繁體字.'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': ' 所有的非汉字都会被*&*去掉'
    }]
    # Rebuild self.op with the flag combination this test exercises.
    self.setUp(False, False, True)
    self._run_remove_non_chinese_character(samples)
|
||
def test_remove_non_chinese_character5(self):
    """Check the op built with keep_alphabet=True, keep_number=True,
    keep_punc=False against four text/target pairs."""
    # NOTE(review): the second and third inputs previously contained the
    # e-mail obfuscation placeholder '[email protected]', which cannot
    # yield the targets below. The addresses are reconstructed so that
    # their letters and digits match the targets; the position of '@' is
    # a best guess (it never appears in a target) -- TODO confirm against
    # the upstream repository.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁dasoidhao1264fg45om'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': 'fexam匹配ple汉12字mani包括rdas繁88體字hhqbchdds1'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': '所有的非汉字a44sh都1246h会被qb4525去掉'
    }]
    # Rebuild self.op with the flag combination this test exercises.
    self.setUp(True, True, False)
    self._run_remove_non_chinese_character(samples)
|
||
def test_remove_non_chinese_character6(self):
    """Check the op built with keep_alphabet=True, keep_number=False,
    keep_punc=True against four text/target pairs."""
    # NOTE(review): the second and third inputs previously contained the
    # e-mail obfuscation placeholder '[email protected]', which cannot
    # yield the targets below. The addresses are reconstructed so that
    # their letters and punctuation match the targets; the position of
    # '@' is a best guess (it never appears in a target) -- TODO confirm
    # against the upstream repository.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除,几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁dasoidhaofg.om'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': 'f//exam匹配ple汉字ma-ni包括rdas繁體字hhqbchd.ds'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': ' 所有的非汉字ash都h会被*&*qb去掉'
    }]
    # Rebuild self.op with the flag combination this test exercises.
    self.setUp(True, False, True)
    self._run_remove_non_chinese_character(samples)
|
||
def test_remove_non_chinese_character7(self):
    """Check the op built with keep_alphabet=False, keep_number=True,
    keep_punc=True against four text/target pairs."""
    # NOTE(review): the second and third inputs previously contained the
    # e-mail obfuscation placeholder '[email protected]', which cannot
    # yield the targets below. The addresses are reconstructed so that
    # their digits and punctuation match the targets; the position of '@'
    # is a best guess (it never appears in a target) -- TODO confirm
    # against the upstream repository.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除,几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁1264.45'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': '//匹配汉12字-包括繁88體字.1'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': ' 所有的非汉字44都1246会被*&*4525去掉'
    }]
    # Rebuild self.op with the flag combination this test exercises.
    self.setUp(False, True, True)
    self._run_remove_non_chinese_character(samples)
|
||
def test_remove_non_chinese_character8(self):
    """Check the op built with keep_alphabet=True, keep_number=True,
    keep_punc=True against four text/target pairs."""
    # NOTE(review): the second and third inputs previously contained the
    # e-mail obfuscation placeholder '[email protected]', which cannot
    # yield the targets below. The addresses are reconstructed from the
    # targets, which pin every character except '@'; '@' is dropped even
    # with all flags on (see '12@46h' -> '1246h' in the last sample), so
    # its position is a best guess -- TODO confirm against the upstream
    # repository.
    samples = [{
        'text': '特殊的康熙部首或者扩展部首会被去除,⼏几⺇',
        'target': '特殊的康熙部首或者扩展部首会被去除,几'
    }, {
        'text': '请问你是谁dasoidhao@1264fg.45om',
        'target': '请问你是谁dasoidhao1264fg.45om'
    }, {
        'text': 'f://exam匹配ple汉12字ma-niè包括rdas繁88體字h@hqbchd.ds1',
        'target': 'f//exam匹配ple汉12字ma-ni包括rdas繁88體字hhqbchd.ds1'
    }, {
        'text': '👊 所有的非汉字a44sh都12@46h会被*&……*qb^4525去掉',
        'target': ' 所有的非汉字a44sh都1246h会被*&*qb4525去掉'
    }]
    # Rebuild self.op with the flag combination this test exercises.
    self.setUp(True, True, True)
    self._run_remove_non_chinese_character(samples)
|
||
|
||
|