From b24456502959d7e7876a8c4505ff9c5b315b9247 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Thu, 18 Apr 2024 11:53:43 +0800
Subject: [PATCH 1/3] * fix for "undefined opencc" bug for
 chinese_convert_mapper

---
 data_juicer/ops/mapper/chinese_convert_mapper.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py
index 937a3b9ed..3ad124045 100644
--- a/data_juicer/ops/mapper/chinese_convert_mapper.py
+++ b/data_juicer/ops/mapper/chinese_convert_mapper.py
@@ -7,10 +7,13 @@
 with AvailabilityChecking(['opencc'], OP_NAME):
     import opencc  # noqa: F401
 
+OPENCC_CONVERTER = None
+
 
 def prepare_converter(mode):
     global OPENCC_CONVERTER
-    OPENCC_CONVERTER = opencc.OpenCC(mode + '.json')
+    if OPENCC_CONVERTER is None:
+        OPENCC_CONVERTER = opencc.OpenCC(mode + '.json')
 
 
 @OPERATORS.register_module(OP_NAME)
@@ -70,9 +73,11 @@ def __init__(self, mode: str = 's2t', *args, **kwargs):
         ]
         assert mode in mode_list, 'Please make sure mode is one of {}'.format(
             mode_list)
-        prepare_converter(mode)
+        self.mode = mode
+        prepare_converter(self.mode)
 
     def process(self, sample):
+        prepare_converter(self.mode)
 
         sample[self.text_key] = OPENCC_CONVERTER.convert(sample[self.text_key])
         return sample

From ff78d286ff0b94194f0d246d249d2a0dcb4dc762 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Thu, 18 Apr 2024 13:02:02 +0800
Subject: [PATCH 2/3] * allow changing the config of the opencc converter

---
 data_juicer/ops/mapper/chinese_convert_mapper.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py
index 3ad124045..818f1b1d4 100644
--- a/data_juicer/ops/mapper/chinese_convert_mapper.py
+++ b/data_juicer/ops/mapper/chinese_convert_mapper.py
@@ -11,9 +11,15 @@
 
 
 def prepare_converter(mode):
+    mode_path = mode + '.json'
     global OPENCC_CONVERTER
     if OPENCC_CONVERTER is None:
-        OPENCC_CONVERTER = opencc.OpenCC(mode + '.json')
+        # no converter yet: create one for the requested mode
+        OPENCC_CONVERTER = opencc.OpenCC(mode_path)
+    if not OPENCC_CONVERTER.config.endswith(mode_path):
+        # the config attribute is actually a config file path, so check its
+        # suffix; on a mismatch, build a new converter for the given mode
+        OPENCC_CONVERTER = opencc.OpenCC(mode_path)
 
 
 @OPERATORS.register_module(OP_NAME)

From 77a8a605b6946cd6e0e4c0d5c1d3d44954ba95e4 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Fri, 19 Apr 2024 11:46:37 +0800
Subject: [PATCH 3/3] * fix: undefined punctuation_pattern

---
 data_juicer/ops/deduplicator/document_minhash_deduplicator.py | 1 +
 data_juicer/ops/deduplicator/document_simhash_deduplicator.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py
index 96136db0b..b0c5e0a51 100644
--- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py
+++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py
@@ -163,6 +163,7 @@ def __init__(
             logger.warning('Be careful that tokenization with punctuations '
                            'won\'t work if the ignore pattern includes '
                            'punctuations.')
+        self.punctuation_pattern = regex.compile(r'\p{P}')
 
         if self.tokenization == 'sentencepiece':
             if tokenizer_model is None:
diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py
index 3d1afa475..0eaad8edc 100644
--- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py
+++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py
@@ -69,6 +69,7 @@ def __init__(self,
             logger.warning('Be careful that tokenization with punctuations '
                            'won\'t work if the ignore pattern includes '
                            'punctuations.')
+        self.punctuation_pattern = regex.compile(r'\p{P}')
 
         # about deduplication
         self.num_blocks = num_blocks