From b24456502959d7e7876a8c4505ff9c5b315b9247 Mon Sep 17 00:00:00 2001 From: "lielin.hyl" Date: Thu, 18 Apr 2024 11:53:43 +0800 Subject: [PATCH 1/3] * fix for "undefined opencc" bug for chinese_convert_mapper --- data_juicer/ops/mapper/chinese_convert_mapper.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 937a3b9ed..3ad124045 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -7,10 +7,13 @@ with AvailabilityChecking(['opencc'], OP_NAME): import opencc # noqa: F401 +OPENCC_CONVERTER = None + def prepare_converter(mode): global OPENCC_CONVERTER - OPENCC_CONVERTER = opencc.OpenCC(mode + '.json') + if OPENCC_CONVERTER is None: + OPENCC_CONVERTER = opencc.OpenCC(mode + '.json') @OPERATORS.register_module(OP_NAME) @@ -70,9 +73,11 @@ def __init__(self, mode: str = 's2t', *args, **kwargs): ] assert mode in mode_list, 'Please make sure mode is one of {}'.format( mode_list) - prepare_converter(mode) + self.mode = mode + prepare_converter(self.mode) def process(self, sample): + prepare_converter(self.mode) sample[self.text_key] = OPENCC_CONVERTER.convert(sample[self.text_key]) return sample From ff78d286ff0b94194f0d246d249d2a0dcb4dc762 Mon Sep 17 00:00:00 2001 From: "lielin.hyl" Date: Thu, 18 Apr 2024 13:02:02 +0800 Subject: [PATCH 2/3] * allow to change the config of opencc converter --- data_juicer/ops/mapper/chinese_convert_mapper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 3ad124045..818f1b1d4 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -11,9 +11,15 @@ def prepare_converter(mode): + mode_path = mode + '.json' global OPENCC_CONVERTER if OPENCC_CONVERTER is None: - OPENCC_CONVERTER = opencc.OpenCC(mode + '.json') + # empty converter + OPENCC_CONVERTER = opencc.OpenCC(mode_path) + if not OPENCC_CONVERTER.config.endswith(mode_path): + # the config is actually a config path + # update and get a new converter with specified mode + OPENCC_CONVERTER = opencc.OpenCC(mode_path) @OPERATORS.register_module(OP_NAME) From 77a8a605b6946cd6e0e4c0d5c1d3d44954ba95e4 Mon Sep 17 00:00:00 2001 From: "lielin.hyl" Date: Fri, 19 Apr 2024 11:46:37 +0800 Subject: [PATCH 3/3] * fix: undefined punctuation_pattern --- data_juicer/ops/deduplicator/document_minhash_deduplicator.py | 1 + data_juicer/ops/deduplicator/document_simhash_deduplicator.py | 1 + 2 files changed, 2 insertions(+) diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py index 96136db0b..b0c5e0a51 100644 --- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py @@ -163,6 +163,7 @@ def __init__( logger.warning('Be careful that tokenization with punctuations ' 'won\'t work if the ignore pattern includes ' 'punctuations.') + self.punctuation_pattern = regex.compile(r'\p{P}') if self.tokenization == 'sentencepiece': if tokenizer_model is None: diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py index 3d1afa475..0eaad8edc 100644 --- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py @@ -69,6 +69,7 @@ def __init__(self, logger.warning('Be careful that tokenization with punctuations ' 'won\'t work if the ignore pattern includes ' 'punctuations.') + self.punctuation_pattern = regex.compile(r'\p{P}') # about deduplication self.num_blocks = num_blocks