From 92b104c581b6d7ba93b5e18e8cb4292cbf44aa23 Mon Sep 17 00:00:00 2001 From: zhijianma Date: Wed, 3 Jan 2024 18:24:57 +0800 Subject: [PATCH] fix: yapf does not work --- .pre-commit-config.yaml | 1 - data_juicer/analysis/column_wise_analysis.py | 8 +++--- data_juicer/config/config.py | 25 ++++++++++++------- data_juicer/core/analyser.py | 4 +-- data_juicer/ops/filter/__init__.py | 3 +++ .../ops/filter/language_id_score_filter.py | 3 +-- data_juicer/ops/mapper/__init__.py | 3 +++ data_juicer/ops/mapper/clean_email_mapper.py | 2 +- data_juicer/ops/mapper/clean_ip_mapper.py | 2 +- data_juicer/ops/mapper/clean_links_mapper.py | 2 +- data_juicer/ops/mapper/fix_unicode_mapper.py | 5 ++-- .../mapper/remove_repeat_sentences_mapper.py | 13 +++++----- .../ops/mapper/replace_content_mapper.py | 6 ++--- data_juicer/utils/availability_utils.py | 2 -- data_juicer/utils/constant.py | 4 +-- 15 files changed, 45 insertions(+), 38 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 29fd98cba..cb4b56d42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,6 @@ repos: rev: v0.32.0 hooks: - id: yapf - args: ['--style', '{column_limit: 79}'] exclude: data_juicer/ops/common/special_characters.py - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 diff --git a/data_juicer/analysis/column_wise_analysis.py b/data_juicer/analysis/column_wise_analysis.py index 194bca503..01b8d6e5b 100644 --- a/data_juicer/analysis/column_wise_analysis.py +++ b/data_juicer/analysis/column_wise_analysis.py @@ -58,8 +58,7 @@ def __init__(self, dataset, output_path, overall_result=None, - save_stats_in_one_file=True, - ): + save_stats_in_one_file=True): """ Initialization method :param dataset: the dataset to be analysed @@ -168,8 +167,9 @@ def analyse(self, show_percentiles=False, show=False, skip_export=False): if not skip_export: self.draw_hist( - axes, data, os.path.join( - self.output_path, f'{column_name}-hist.png')) + axes, data, + os.path.join(self.output_path, + f'{column_name}-hist.png')) # add a title to the figure of this stat if self.save_stats_in_one_file: diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index e3d0de80c..f45c388a5 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -272,14 +272,12 @@ def update_ds_cache_dir_and_related_vars(new_ds_cache_path): # and two more PATHS that depend on HF_DATASETS_CACHE # - path to store downloaded datasets (e.g. remote datasets) config.DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join( - config.HF_DATASETS_CACHE, config.DOWNLOADED_DATASETS_DIR - ) + config.HF_DATASETS_CACHE, config.DOWNLOADED_DATASETS_DIR) config.DOWNLOADED_DATASETS_PATH = Path( config.DEFAULT_DOWNLOADED_DATASETS_PATH) # - path to store extracted datasets (e.g. xxx.jsonl.zst) config.DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join( - config.DEFAULT_DOWNLOADED_DATASETS_PATH, config.EXTRACTED_DATASETS_DIR - ) + config.DEFAULT_DOWNLOADED_DATASETS_PATH, config.EXTRACTED_DATASETS_DIR) config.EXTRACTED_DATASETS_PATH = Path( config.DEFAULT_EXTRACTED_DATASETS_PATH) @@ -529,8 +527,13 @@ def display_config(cfg): print(table) -def export_config(cfg, path, format='yaml', skip_none=True, skip_check=True, - overwrite=False, multifile=True): +def export_config(cfg, + path, + format='yaml', + skip_none=True, + skip_check=True, + overwrite=False, + multifile=True): """ save the config object, some params are from jsonargparse :param cfg: cfg object to save (Namespace type) @@ -552,9 +555,13 @@ def export_config(cfg, path, format='yaml', skip_none=True, skip_check=True, global global_parser if not global_parser: init_configs() # enable the customized type parser - global_parser.save( - cfg=cfg_to_export, path=path, format=format, skip_none=skip_none, - skip_check=skip_check, overwrite=overwrite, multifile=multifile) + global_parser.save(cfg=cfg_to_export, + path=path, + format=format, + skip_none=skip_none, + skip_check=skip_check, + overwrite=overwrite, + multifile=multifile) logger.info(f'Saved the configuration in {path}') diff --git a/data_juicer/core/analyser.py b/data_juicer/core/analyser.py index 903a3d8e3..04c48b33b 100644 --- a/data_juicer/core/analyser.py +++ b/data_juicer/core/analyser.py @@ -121,8 +121,8 @@ def run(self, load_data_np=None, skip_export=False): logger.info('Applying overall analysis on stats...') overall_analysis = OverallAnalysis(dataset, self.analysis_path) - self.overall_result = overall_analysis.analyse( - num_proc=self.cfg.np, skip_export=skip_export) + self.overall_result = overall_analysis.analyse(num_proc=self.cfg.np, + skip_export=skip_export) logger.info(f'The overall analysis results are: {self.overall_result}') diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py index 3770854de..82148ca0e 100644 --- a/data_juicer/ops/filter/__init__.py +++ b/data_juicer/ops/filter/__init__.py @@ -1,3 +1,4 @@ +# yapf: disable from . import (alphanumeric_filter, average_line_length_filter, character_repetition_filter, face_area_filter, flagged_words_filter, image_aspect_ratio_filter, @@ -9,3 +10,5 @@ stopwords_filter, suffix_filter, text_action_filter, text_entity_dependency_filter, text_length_filter, token_num_filter, word_num_filter, word_repetition_filter) + +# yapf: enable diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index ed76ef99c..cae60f99b 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -54,8 +54,7 @@ def compute_stats(self, sample): return sample text = sample[self.text_key].lower().replace('\n', ' ') - ft_model = get_model(self.model_key, - model_type='fasttext') + ft_model = get_model(self.model_key, model_type='fasttext') if ft_model is None: err_msg = 'Model not loaded. Please retry later.' logger.error(err_msg) diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index b78be1884..03beb99dd 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -1,3 +1,4 @@ +# yapf: disable from . import (chinese_convert_mapper, clean_copyright_mapper, clean_email_mapper, clean_html_mapper, clean_ip_mapper, clean_links_mapper, expand_macro_mapper, fix_unicode_mapper, @@ -10,3 +11,5 @@ remove_words_with_incorrect_substrings_mapper, replace_content_mapper, sentence_split_mapper, whitespace_normalization_mapper) + +# yapf: enable diff --git a/data_juicer/ops/mapper/clean_email_mapper.py b/data_juicer/ops/mapper/clean_email_mapper.py index 8e340241c..9708363e5 100644 --- a/data_juicer/ops/mapper/clean_email_mapper.py +++ b/data_juicer/ops/mapper/clean_email_mapper.py @@ -23,7 +23,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs): self.pattern = pattern if ((len(pattern) > 2) and (pattern.startswith("r'") and pattern.endswith("'") - or pattern.startswith('r"') and pattern.endswith('"'))): + or pattern.startswith('r"') and pattern.endswith('"'))): self.pattern = pattern[2:-1] self.repl = repl diff --git a/data_juicer/ops/mapper/clean_ip_mapper.py b/data_juicer/ops/mapper/clean_ip_mapper.py index 893296ced..607aeb585 100644 --- a/data_juicer/ops/mapper/clean_ip_mapper.py +++ b/data_juicer/ops/mapper/clean_ip_mapper.py @@ -28,7 +28,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs): self.pattern = pattern if ((len(pattern) > 2) and (pattern.startswith("r'") and pattern.endswith("'") - or pattern.startswith('r"') and pattern.endswith('"'))): + or pattern.startswith('r"') and pattern.endswith('"'))): self.pattern = pattern[2:-1] self.repl = repl diff --git a/data_juicer/ops/mapper/clean_links_mapper.py b/data_juicer/ops/mapper/clean_links_mapper.py index 780289d4f..bcd90d524 100644 --- a/data_juicer/ops/mapper/clean_links_mapper.py +++ b/data_juicer/ops/mapper/clean_links_mapper.py @@ -34,7 +34,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs): self.pattern = pattern if ((len(pattern) > 2) and (pattern.startswith("r'") and pattern.endswith("'") - or pattern.startswith('r"') and pattern.endswith('"'))): + or pattern.startswith('r"') and pattern.endswith('"'))): self.pattern = pattern[2:-1] self.repl = repl diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py index 1f2685c1f..dc30777d1 100644 --- a/data_juicer/ops/mapper/fix_unicode_mapper.py +++ b/data_juicer/ops/mapper/fix_unicode_mapper.py @@ -34,7 +34,6 @@ def __init__(self, normalization: str = None, *args, **kwargs): '["NFC", "NFKC", "NFD", "NFKD"]') def process(self, sample): - sample[self.text_key] = ftfy.fix_text( - sample[self.text_key], - normalization=self.normalization) + sample[self.text_key] = ftfy.fix_text(sample[self.text_key], + normalization=self.normalization) return sample diff --git a/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py b/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py index a844dccca..a1069d24d 100644 --- a/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py +++ b/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py @@ -4,10 +4,10 @@ def split_sentence(text): - text = re.sub('([.。!!?\?])([^’”])',r'\1\n\2',text) # noqa - text = re.sub('(\.{6})([^’”])',r'\1\n\2',text) # noqa - text = re.sub('(\…{2})([^’”])',r'\1\n\2',text) # noqa - text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])',r'\1\n\2',text) # noqa + text = re.sub('([.。!!?\?])([^’”])', r'\1\n\2', text) # noqa + text = re.sub('(\.{6})([^’”])', r'\1\n\2', text) # noqa + text = re.sub('(\…{2})([^’”])', r'\1\n\2', text) # noqa + text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2', text) # noqa return text.split('\n') @@ -40,9 +40,8 @@ def __init__(self, super().__init__(*args, **kwargs) self.lowercase = lowercase self.min_repeat_sentence_length = min_repeat_sentence_length - self.remove_regex = re.compile( - r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]' - ) if ignore_special_character else None + self.remove_regex = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]' + ) if ignore_special_character else None def process(self, sample): diff --git a/data_juicer/ops/mapper/replace_content_mapper.py b/data_juicer/ops/mapper/replace_content_mapper.py index 0d4b2cd4a..703405001 100644 --- a/data_juicer/ops/mapper/replace_content_mapper.py +++ b/data_juicer/ops/mapper/replace_content_mapper.py @@ -20,9 +20,9 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs): """ super().__init__(*args, **kwargs) self.pattern = pattern - if ((pattern is not None and len(pattern) > 2) and - (pattern.startswith("r'") and pattern.endswith("'") or - pattern.startswith('r"') and pattern.endswith('"'))): + if ((pattern is not None and len(pattern) > 2) + and (pattern.startswith("r'") and pattern.endswith("'") + or pattern.startswith('r"') and pattern.endswith('"'))): self.pattern = pattern[2:-1] self.repl = repl diff --git a/data_juicer/utils/availability_utils.py b/data_juicer/utils/availability_utils.py index 17b718403..44b8a4a31 100644 --- a/data_juicer/utils/availability_utils.py +++ b/data_juicer/utils/availability_utils.py @@ -1,5 +1,3 @@ - - from loguru import logger UNAVAILABLE_OPERATORS = {} diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py index 6fa190263..3273d1ec1 100644 --- a/data_juicer/utils/constant.py +++ b/data_juicer/utils/constant.py @@ -63,8 +63,8 @@ def get_access_log(cls, dj_cfg=None): # Create a stream reader for the file and decode the # first line with dctx.stream_reader(compressed_file) as reader: - text_stream = io.TextIOWrapper( - reader, encoding='utf-8') + text_stream = io.TextIOWrapper(reader, + encoding='utf-8') first_line = text_stream.readline() elif 'jsonl' in dj_cfg.dataset_path: tmp_f_name = dj_cfg.dataset_path. \