# Module-level OpenCC converter shared by every caller of prepare_converter().
# It stays None until prepare_converter() is called for the first time.
OPENCC_CONVERTER = None


def prepare_converter(mode):
    """
    Make sure the module-level OpenCC converter matches ``mode``.

    The converter lives in the global ``OPENCC_CONVERTER``. A new
    ``opencc.OpenCC`` is built only when no converter exists yet, or when
    the file name of the current converter's ``config`` is different from
    ``<mode>.json``. Otherwise the existing converter is kept as is.

    :param mode: name of the conversion mode without the ``.json``
        suffix, e.g. ``'s2t'``.
    """
    # Local import so this function does not rely on the file's import block.
    import os

    global OPENCC_CONVERTER

    mode_path = mode + '.json'
    converter = OPENCC_CONVERTER

    if converter is not None:
        # ``config`` holds a path to the config file rather than the bare
        # mode name, so compare file names. An exact comparison (instead of
        # a suffix test) keeps e.g. ``xs2t.json`` from passing for ``s2t``.
        current_name = os.path.basename(converter.config)
        if current_name == os.path.basename(mode_path):
            return

    # Either nothing is cached yet or the cached converter was built for
    # another mode: replace it with one for the requested mode.
    OPENCC_CONVERTER = opencc.OpenCC(mode_path)
def process(self, sample):
    """
    Convert the text field of ``sample`` with the shared OpenCC converter.

    :param sample: sample whose ``self.text_key`` entry holds the text
        to convert.
    :return: the same sample object with that entry replaced by the
        converted text.
    """
    # Re-check the shared converter before every conversion; presumably
    # this covers processes that never ran __init__ or another operator
    # having switched the global converter to a different mode -- TODO
    # confirm with the callers.
    prepare_converter(self.mode)

    key = self.text_key
    converted = OPENCC_CONVERTER.convert(sample[key])
    sample[key] = converted
    return sample