Skip to content

Commit

Permalink
fix: yapf does not work
Browse files (browse the repository at this point in the history)
  • Loading branch information
zhijianma committed Jan 3, 2024
1 parent 95ca8b0 commit 92b104c
Show file tree
Hide file tree
Showing 15 changed files with 45 additions and 38 deletions.
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ repos:
rev: v0.32.0
hooks:
- id: yapf
args: ['--style', '{column_limit: 79}']
exclude: data_juicer/ops/common/special_characters.py
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/analysis/column_wise_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ def __init__(self,
dataset,
output_path,
overall_result=None,
save_stats_in_one_file=True,
):
save_stats_in_one_file=True):
"""
Initialization method
:param dataset: the dataset to be analysed
Expand Down Expand Up @@ -168,8 +167,9 @@ def analyse(self, show_percentiles=False, show=False, skip_export=False):

if not skip_export:
self.draw_hist(
axes, data, os.path.join(
self.output_path, f'{column_name}-hist.png'))
axes, data,
os.path.join(self.output_path,
f'{column_name}-hist.png'))

# add a title to the figure of this stat
if self.save_stats_in_one_file:
Expand Down
25 changes: 16 additions & 9 deletions data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,12 @@ def update_ds_cache_dir_and_related_vars(new_ds_cache_path):
# and two more PATHS that depend on HF_DATASETS_CACHE
# - path to store downloaded datasets (e.g. remote datasets)
config.DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(
config.HF_DATASETS_CACHE, config.DOWNLOADED_DATASETS_DIR
)
config.HF_DATASETS_CACHE, config.DOWNLOADED_DATASETS_DIR)
config.DOWNLOADED_DATASETS_PATH = Path(
config.DEFAULT_DOWNLOADED_DATASETS_PATH)
# - path to store extracted datasets (e.g. xxx.jsonl.zst)
config.DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(
config.DEFAULT_DOWNLOADED_DATASETS_PATH, config.EXTRACTED_DATASETS_DIR
)
config.DEFAULT_DOWNLOADED_DATASETS_PATH, config.EXTRACTED_DATASETS_DIR)
config.EXTRACTED_DATASETS_PATH = Path(
config.DEFAULT_EXTRACTED_DATASETS_PATH)

Expand Down Expand Up @@ -529,8 +527,13 @@ def display_config(cfg):
print(table)


def export_config(cfg, path, format='yaml', skip_none=True, skip_check=True,
overwrite=False, multifile=True):
def export_config(cfg,
path,
format='yaml',
skip_none=True,
skip_check=True,
overwrite=False,
multifile=True):
"""
save the config object, some params are from jsonargparse
:param cfg: cfg object to save (Namespace type)
Expand All @@ -552,9 +555,13 @@ def export_config(cfg, path, format='yaml', skip_none=True, skip_check=True,
global global_parser
if not global_parser:
init_configs() # enable the customized type parser
global_parser.save(
cfg=cfg_to_export, path=path, format=format, skip_none=skip_none,
skip_check=skip_check, overwrite=overwrite, multifile=multifile)
global_parser.save(cfg=cfg_to_export,
path=path,
format=format,
skip_none=skip_none,
skip_check=skip_check,
overwrite=overwrite,
multifile=multifile)

logger.info(f'Saved the configuration in {path}')

Expand Down
4 changes: 2 additions & 2 deletions data_juicer/core/analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ def run(self, load_data_np=None, skip_export=False):

logger.info('Applying overall analysis on stats...')
overall_analysis = OverallAnalysis(dataset, self.analysis_path)
self.overall_result = overall_analysis.analyse(
num_proc=self.cfg.np, skip_export=skip_export)
self.overall_result = overall_analysis.analyse(num_proc=self.cfg.np,
skip_export=skip_export)

logger.info(f'The overall analysis results are: {self.overall_result}')

Expand Down
3 changes: 3 additions & 0 deletions data_juicer/ops/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# yapf: disable
from . import (alphanumeric_filter, average_line_length_filter,
character_repetition_filter, face_area_filter,
flagged_words_filter, image_aspect_ratio_filter,
Expand All @@ -9,3 +10,5 @@
stopwords_filter, suffix_filter, text_action_filter,
text_entity_dependency_filter, text_length_filter,
token_num_filter, word_num_filter, word_repetition_filter)

# yapf: enable
3 changes: 1 addition & 2 deletions data_juicer/ops/filter/language_id_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ def compute_stats(self, sample):
return sample

text = sample[self.text_key].lower().replace('\n', ' ')
ft_model = get_model(self.model_key,
model_type='fasttext')
ft_model = get_model(self.model_key, model_type='fasttext')
if ft_model is None:
err_msg = 'Model not loaded. Please retry later.'
logger.error(err_msg)
Expand Down
3 changes: 3 additions & 0 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# yapf: disable
from . import (chinese_convert_mapper, clean_copyright_mapper,
clean_email_mapper, clean_html_mapper, clean_ip_mapper,
clean_links_mapper, expand_macro_mapper, fix_unicode_mapper,
Expand All @@ -10,3 +11,5 @@
remove_words_with_incorrect_substrings_mapper,
replace_content_mapper, sentence_split_mapper,
whitespace_normalization_mapper)

# yapf: enable
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_email_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
self.pattern = pattern
if ((len(pattern) > 2) and
(pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"'))):
or pattern.startswith('r"') and pattern.endswith('"'))):
self.pattern = pattern[2:-1]

self.repl = repl
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_ip_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
self.pattern = pattern
if ((len(pattern) > 2) and
(pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"'))):
or pattern.startswith('r"') and pattern.endswith('"'))):
self.pattern = pattern[2:-1]
self.repl = repl

Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_links_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
self.pattern = pattern
if ((len(pattern) > 2) and
(pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"'))):
or pattern.startswith('r"') and pattern.endswith('"'))):
self.pattern = pattern[2:-1]
self.repl = repl

Expand Down
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/fix_unicode_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ def __init__(self, normalization: str = None, *args, **kwargs):
'["NFC", "NFKC", "NFD", "NFKD"]')

def process(self, sample):
sample[self.text_key] = ftfy.fix_text(
sample[self.text_key],
normalization=self.normalization)
sample[self.text_key] = ftfy.fix_text(sample[self.text_key],
normalization=self.normalization)
return sample
13 changes: 6 additions & 7 deletions data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@


def split_sentence(text):
text = re.sub('([.。!!?\?])([^’”])',r'\1\n\2',text) # noqa
text = re.sub('(\.{6})([^’”])',r'\1\n\2',text) # noqa
text = re.sub('(\…{2})([^’”])',r'\1\n\2',text) # noqa
text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])',r'\1\n\2',text) # noqa
text = re.sub('([.。!!?\?])([^’”])', r'\1\n\2', text) # noqa
text = re.sub('(\.{6})([^’”])', r'\1\n\2', text) # noqa
text = re.sub('(\…{2})([^’”])', r'\1\n\2', text) # noqa
text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2', text) # noqa
return text.split('\n')


Expand Down Expand Up @@ -40,9 +40,8 @@ def __init__(self,
super().__init__(*args, **kwargs)
self.lowercase = lowercase
self.min_repeat_sentence_length = min_repeat_sentence_length
self.remove_regex = re.compile(
r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
) if ignore_special_character else None
self.remove_regex = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
) if ignore_special_character else None

def process(self, sample):

Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/mapper/replace_content_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
"""
super().__init__(*args, **kwargs)
self.pattern = pattern
if ((pattern is not None and len(pattern) > 2) and
(pattern.startswith("r'") and pattern.endswith("'") or
pattern.startswith('r"') and pattern.endswith('"'))):
if ((pattern is not None and len(pattern) > 2)
and (pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"'))):
self.pattern = pattern[2:-1]
self.repl = repl

Expand Down
2 changes: 0 additions & 2 deletions data_juicer/utils/availability_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@


from loguru import logger

UNAVAILABLE_OPERATORS = {}
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def get_access_log(cls, dj_cfg=None):
# Create a stream reader for the file and decode the
# first line
with dctx.stream_reader(compressed_file) as reader:
text_stream = io.TextIOWrapper(
reader, encoding='utf-8')
text_stream = io.TextIOWrapper(reader,
encoding='utf-8')
first_line = text_stream.readline()
elif 'jsonl' in dj_cfg.dataset_path:
tmp_f_name = dj_cfg.dataset_path. \
Expand Down

0 comments on commit 92b104c

Please sign in to comment.