fix: yapf does not work #167

Merged 1 commit on Jan 4, 2024
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -11,7 +11,6 @@ repos:
   rev: v0.32.0
   hooks:
   - id: yapf
-    args: ['--style', '{column_limit: 79}']
     exclude: data_juicer/ops/common/special_characters.py
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v4.3.0
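With the hard-coded `--style` argument gone, the hook falls back to yapf's normal style discovery (a `.style.yapf`, `setup.cfg`, or similar config in the repo). For reference, a minimal sketch of what the removed string meant, expressed through yapf's Python API; the `snippet` input is illustrative only:

```python
# Minimal sketch: the style string removed from the hook, passed to
# yapf's Python API instead. 'snippet' is an illustrative input.
from yapf.yapflib.yapf_api import FormatCode

snippet = 'def add(a,b):\n  return a+b\n'
formatted, changed = FormatCode(snippet, style_config='{column_limit: 79}')
print(formatted)  # def add(a, b):\n    return a + b
```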
8 changes: 4 additions & 4 deletions data_juicer/analysis/column_wise_analysis.py
@@ -58,8 +58,7 @@ def __init__(self,
                  dataset,
                  output_path,
                  overall_result=None,
-                 save_stats_in_one_file=True,
-                 ):
+                 save_stats_in_one_file=True):
         """
         Initialization method
         :param dataset: the dataset to be analysed
@@ -168,8 +167,9 @@ def analyse(self, show_percentiles=False, show=False, skip_export=False):
 
         if not skip_export:
             self.draw_hist(
-                axes, data, os.path.join(
-                    self.output_path, f'{column_name}-hist.png'))
+                axes, data,
+                os.path.join(self.output_path,
+                             f'{column_name}-hist.png'))
 
         # add a title to the figure of this stat
         if self.save_stats_in_one_file:
25 changes: 16 additions & 9 deletions data_juicer/config/config.py
@@ -272,14 +272,12 @@ def update_ds_cache_dir_and_related_vars(new_ds_cache_path):
     # and two more PATHS that depend on HF_DATASETS_CACHE
     # - path to store downloaded datasets (e.g. remote datasets)
     config.DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(
-        config.HF_DATASETS_CACHE, config.DOWNLOADED_DATASETS_DIR
-    )
+        config.HF_DATASETS_CACHE, config.DOWNLOADED_DATASETS_DIR)
     config.DOWNLOADED_DATASETS_PATH = Path(
         config.DEFAULT_DOWNLOADED_DATASETS_PATH)
     # - path to store extracted datasets (e.g. xxx.jsonl.zst)
     config.DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(
-        config.DEFAULT_DOWNLOADED_DATASETS_PATH, config.EXTRACTED_DATASETS_DIR
-    )
+        config.DEFAULT_DOWNLOADED_DATASETS_PATH, config.EXTRACTED_DATASETS_DIR)
     config.EXTRACTED_DATASETS_PATH = Path(
         config.DEFAULT_EXTRACTED_DATASETS_PATH)
 
@@ -529,8 +527,13 @@ def display_config(cfg):
     print(table)
 
 
-def export_config(cfg, path, format='yaml', skip_none=True, skip_check=True,
-                  overwrite=False, multifile=True):
+def export_config(cfg,
+                  path,
+                  format='yaml',
+                  skip_none=True,
+                  skip_check=True,
+                  overwrite=False,
+                  multifile=True):
     """
     save the config object, some params are from jsonargparse
     :param cfg: cfg object to save (Namespace type)
@@ -552,9 +555,13 @@ def export_config(cfg, path, format='yaml', skip_none=True, skip_check=True,
     global global_parser
     if not global_parser:
         init_configs()  # enable the customized type parser
-    global_parser.save(
-        cfg=cfg_to_export, path=path, format=format, skip_none=skip_none,
-        skip_check=skip_check, overwrite=overwrite, multifile=multifile)
+    global_parser.save(cfg=cfg_to_export,
+                       path=path,
+                       format=format,
+                       skip_none=skip_none,
+                       skip_check=skip_check,
+                       overwrite=overwrite,
+                       multifile=multifile)
 
     logger.info(f'Saved the configuration in {path}')
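A hedged usage sketch for the reformatted `export_config`; the output path is hypothetical, and the keywords simply mirror the signature shown in the diff:

```python
# Hypothetical call site: save a parsed config as YAML, overwriting
# any previous export. Keywords mirror the signature in the diff.
export_config(cfg,
              path='outputs/exported_config.yaml',
              format='yaml',
              skip_none=True,
              overwrite=True)
```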
4 changes: 2 additions & 2 deletions data_juicer/core/analyser.py
@@ -121,8 +121,8 @@ def run(self, load_data_np=None, skip_export=False):
 
         logger.info('Applying overall analysis on stats...')
         overall_analysis = OverallAnalysis(dataset, self.analysis_path)
-        self.overall_result = overall_analysis.analyse(
-            num_proc=self.cfg.np, skip_export=skip_export)
+        self.overall_result = overall_analysis.analyse(num_proc=self.cfg.np,
+                                                       skip_export=skip_export)
 
         logger.info(f'The overall analysis results are: {self.overall_result}')
3 changes: 3 additions & 0 deletions data_juicer/ops/filter/__init__.py
@@ -1,3 +1,4 @@
+# yapf: disable
 from . import (alphanumeric_filter, average_line_length_filter,
                character_repetition_filter, face_area_filter,
                flagged_words_filter, image_aspect_ratio_filter,
@@ -9,3 +10,5 @@
                stopwords_filter, suffix_filter, text_action_filter,
                text_entity_dependency_filter, text_length_filter,
                token_num_filter, word_num_filter, word_repetition_filter)
+
+# yapf: enable
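The new markers fence off the import block so yapf leaves its hand-wrapped layout alone. A small self-contained sketch of the same technique (the constant is illustrative):

```python
# yapf skips everything between these markers, so hand-aligned
# constructs survive reformatting.
# yapf: disable
IDENTITY = [
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
]
# yapf: enable
```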
3 changes: 1 addition & 2 deletions data_juicer/ops/filter/language_id_score_filter.py
@@ -54,8 +54,7 @@ def compute_stats(self, sample):
             return sample
 
         text = sample[self.text_key].lower().replace('\n', ' ')
-        ft_model = get_model(self.model_key,
-                             model_type='fasttext')
+        ft_model = get_model(self.model_key, model_type='fasttext')
         if ft_model is None:
             err_msg = 'Model not loaded. Please retry later.'
             logger.error(err_msg)
3 changes: 3 additions & 0 deletions data_juicer/ops/mapper/__init__.py
@@ -1,3 +1,4 @@
+# yapf: disable
 from . import (chinese_convert_mapper, clean_copyright_mapper,
                clean_email_mapper, clean_html_mapper, clean_ip_mapper,
                clean_links_mapper, expand_macro_mapper, fix_unicode_mapper,
@@ -10,3 +11,5 @@
                remove_words_with_incorrect_substrings_mapper,
                replace_content_mapper, sentence_split_mapper,
                whitespace_normalization_mapper)
+
+# yapf: enable
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_email_mapper.py
@@ -23,7 +23,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
         self.pattern = pattern
         if ((len(pattern) > 2) and
                 (pattern.startswith("r'") and pattern.endswith("'")
-                or pattern.startswith('r"') and pattern.endswith('"'))):
+                 or pattern.startswith('r"') and pattern.endswith('"'))):
             self.pattern = pattern[2:-1]
 
         self.repl = repl
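The same raw-string unwrapping appears in `clean_ip_mapper` and `clean_links_mapper` below. A standalone sketch of the branch above (the helper name is hypothetical, not part of the repo): patterns that arrive as literal text such as "r'...'" from a YAML config are stripped to the bare regex before compilation.

```python
import re


def unwrap_raw_pattern(pattern: str) -> str:
    # Hypothetical helper mirroring the branch above: turn the literal
    # text "r'...'" or 'r"..."' into the bare regex between the quotes.
    if ((len(pattern) > 2)
            and (pattern.startswith("r'") and pattern.endswith("'")
                 or pattern.startswith('r"') and pattern.endswith('"'))):
        return pattern[2:-1]
    return pattern


print(unwrap_raw_pattern("r'[\\w.-]+@[\\w.-]+'"))  # [\w.-]+@[\w.-]+
re.compile(unwrap_raw_pattern("r'[\\w.-]+@[\\w.-]+'"))  # compiles cleanly
```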
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_ip_mapper.py
@@ -28,7 +28,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
         self.pattern = pattern
         if ((len(pattern) > 2) and
                 (pattern.startswith("r'") and pattern.endswith("'")
-                or pattern.startswith('r"') and pattern.endswith('"'))):
+                 or pattern.startswith('r"') and pattern.endswith('"'))):
             self.pattern = pattern[2:-1]
         self.repl = repl
 
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_links_mapper.py
@@ -34,7 +34,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
         self.pattern = pattern
         if ((len(pattern) > 2) and
                 (pattern.startswith("r'") and pattern.endswith("'")
-                or pattern.startswith('r"') and pattern.endswith('"'))):
+                 or pattern.startswith('r"') and pattern.endswith('"'))):
             self.pattern = pattern[2:-1]
         self.repl = repl
 
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/fix_unicode_mapper.py
@@ -34,7 +34,6 @@ def __init__(self, normalization: str = None, *args, **kwargs):
                              '["NFC", "NFKC", "NFD", "NFKD"]')
 
     def process(self, sample):
-        sample[self.text_key] = ftfy.fix_text(
-            sample[self.text_key],
-            normalization=self.normalization)
+        sample[self.text_key] = ftfy.fix_text(sample[self.text_key],
+                                              normalization=self.normalization)
         return sample
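For context on the reformatted call: `ftfy.fix_text` repairs mojibake, and the `normalization` argument selects a Unicode form. A quick sketch, assuming `ftfy` is installed:

```python
import ftfy

# The classic mojibake repair from ftfy's documentation:
print(ftfy.fix_text('âœ” No problems'))  # -> '✔ No problems'
# The `normalization` argument used above selects a Unicode form,
# e.g. NFKC folds the 'fi' ligature into plain letters:
print(ftfy.fix_text('ﬁle', normalization='NFKC'))  # -> 'file'
```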
13 changes: 6 additions & 7 deletions data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
@@ -4,10 +4,10 @@
 
 
 def split_sentence(text):
-    text = re.sub('([.。!!?\?])([^’”])',r'\1\n\2',text)  # noqa
-    text = re.sub('(\.{6})([^’”])',r'\1\n\2',text)  # noqa
-    text = re.sub('(\…{2})([^’”])',r'\1\n\2',text)  # noqa
-    text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])',r'\1\n\2',text)  # noqa
+    text = re.sub('([.。!!?\?])([^’”])', r'\1\n\2', text)  # noqa
+    text = re.sub('(\.{6})([^’”])', r'\1\n\2', text)  # noqa
+    text = re.sub('(\…{2})([^’”])', r'\1\n\2', text)  # noqa
+    text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2', text)  # noqa
     return text.split('\n')
 
 
@@ -40,9 +40,8 @@ def __init__(self,
         super().__init__(*args, **kwargs)
         self.lowercase = lowercase
         self.min_repeat_sentence_length = min_repeat_sentence_length
-        self.remove_regex = re.compile(
-            r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
-        ) if ignore_special_character else None
+        self.remove_regex = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
+                                       ) if ignore_special_character else None
 
     def process(self, sample):
 
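A usage sketch for `split_sentence` from the first hunk of this file; the sample string is illustrative. Terminators (Chinese or English) not followed by a closing quote become newline split points, and a space after an English period travels to the start of the next segment:

```python
# Assuming split_sentence from the hunk above is in scope.
print(split_sentence('今天天气很好。我们去公园。Sentence one. Sentence two.'))
# -> ['今天天气很好。', '我们去公园。', 'Sentence one.', ' Sentence two.']
```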
6 changes: 3 additions & 3 deletions data_juicer/ops/mapper/replace_content_mapper.py
@@ -20,9 +20,9 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
         """
         super().__init__(*args, **kwargs)
         self.pattern = pattern
-        if ((pattern is not None and len(pattern) > 2) and
-                (pattern.startswith("r'") and pattern.endswith("'") or
-                 pattern.startswith('r"') and pattern.endswith('"'))):
+        if ((pattern is not None and len(pattern) > 2)
+                and (pattern.startswith("r'") and pattern.endswith("'")
+                     or pattern.startswith('r"') and pattern.endswith('"'))):
             self.pattern = pattern[2:-1]
         self.repl = repl
 
2 changes: 0 additions & 2 deletions data_juicer/utils/availability_utils.py
@@ -1,5 +1,3 @@
-
-
 from loguru import logger
 
 UNAVAILABLE_OPERATORS = {}
4 changes: 2 additions & 2 deletions data_juicer/utils/constant.py
@@ -63,8 +63,8 @@ def get_access_log(cls, dj_cfg=None):
                     # Create a stream reader for the file and decode the
                     # first line
                     with dctx.stream_reader(compressed_file) as reader:
-                        text_stream = io.TextIOWrapper(
-                            reader, encoding='utf-8')
+                        text_stream = io.TextIOWrapper(reader,
+                                                       encoding='utf-8')
                         first_line = text_stream.readline()
             elif 'jsonl' in dj_cfg.dataset_path:
                 tmp_f_name = dj_cfg.dataset_path. \
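The reformatted block peeks at the first record of a zstd-compressed JSONL dataset. A self-contained sketch of the same pattern, assuming the `zstandard` package and a hypothetical file path:

```python
import io

import zstandard

# Hypothetical path; any .jsonl.zst file works.
with open('dataset.jsonl.zst', 'rb') as compressed_file:
    dctx = zstandard.ZstdDecompressor()
    # Stream-decompress just enough bytes to decode the first line.
    with dctx.stream_reader(compressed_file) as reader:
        text_stream = io.TextIOWrapper(reader, encoding='utf-8')
        first_line = text_stream.readline()
print(first_line)
```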