fix: yapf does not work #167

Merged 1 commit on Jan 4, 2024
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -11,7 +11,6 @@ repos:
   rev: v0.32.0
   hooks:
   - id: yapf
-    args: ['--style', '{column_limit: 79}']
     exclude: data_juicer/ops/common/special_characters.py
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v4.3.0
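With the hard-coded `--style` argument gone, the hook falls back to yapf's normal style discovery (a `.style.yapf`, `setup.cfg`, or similar config in the repo). For reference, a minimal sketch of what the removed string meant, expressed through yapf's Python API; the `snippet` input is illustrative only:

```python
# Minimal sketch: the style string removed from the hook, passed to
# yapf's Python API instead. 'snippet' is an illustrative input.
from yapf.yapflib.yapf_api import FormatCode

snippet = 'def add(a,b):\n  return a+b\n'
formatted, changed = FormatCode(snippet, style_config='{column_limit: 79}')
print(formatted)  # def add(a, b):\n    return a + b
```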
8 changes: 4 additions & 4 deletions data_juicer/analysis/column_wise_analysis.py
@@ -58,8 +58,7 @@ def __init__(self,
                  dataset,
                  output_path,
                  overall_result=None,
-                 save_stats_in_one_file=True,
-                 ):
+                 save_stats_in_one_file=True):
         """
         Initialization method
         :param dataset: the dataset to be analysed
@@ -168,8 +167,9 @@ def analyse(self, show_percentiles=False, show=False, skip_export=False):
 
         if not skip_export:
             self.draw_hist(
-                axes, data, os.path.join(
-                    self.output_path, f'{column_name}-hist.png'))
+                axes, data,
+                os.path.join(self.output_path,
+                             f'{column_name}-hist.png'))
 
         # add a title to the figure of this stat
         if self.save_stats_in_one_file:
25 changes: 16 additions & 9 deletions data_juicer/config/config.py
@@ -272,14 +272,12 @@ def update_ds_cache_dir_and_related_vars(new_ds_cache_path):
     # and two more PATHS that depend on HF_DATASETS_CACHE
     # - path to store downloaded datasets (e.g. remote datasets)
     config.DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(
-        config.HF_DATASETS_CACHE, config.DOWNLOADED_DATASETS_DIR
-    )
+        config.HF_DATASETS_CACHE, config.DOWNLOADED_DATASETS_DIR)
     config.DOWNLOADED_DATASETS_PATH = Path(
         config.DEFAULT_DOWNLOADED_DATASETS_PATH)
     # - path to store extracted datasets (e.g. xxx.jsonl.zst)
     config.DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(
-        config.DEFAULT_DOWNLOADED_DATASETS_PATH, config.EXTRACTED_DATASETS_DIR
-    )
+        config.DEFAULT_DOWNLOADED_DATASETS_PATH, config.EXTRACTED_DATASETS_DIR)
     config.EXTRACTED_DATASETS_PATH = Path(
         config.DEFAULT_EXTRACTED_DATASETS_PATH)
 
@@ -529,8 +527,13 @@ def display_config(cfg):
     print(table)
 
 
-def export_config(cfg, path, format='yaml', skip_none=True, skip_check=True,
-                  overwrite=False, multifile=True):
+def export_config(cfg,
+                  path,
+                  format='yaml',
+                  skip_none=True,
+                  skip_check=True,
+                  overwrite=False,
+                  multifile=True):
     """
     save the config object, some params are from jsonargparse
     :param cfg: cfg object to save (Namespace type)
@@ -552,9 +555,13 @@ def export_config(cfg, path, format='yaml', skip_none=True, skip_check=True,
     global global_parser
     if not global_parser:
         init_configs()  # enable the customized type parser
-    global_parser.save(
-        cfg=cfg_to_export, path=path, format=format, skip_none=skip_none,
-        skip_check=skip_check, overwrite=overwrite, multifile=multifile)
+    global_parser.save(cfg=cfg_to_export,
+                       path=path,
+                       format=format,
+                       skip_none=skip_none,
+                       skip_check=skip_check,
+                       overwrite=overwrite,
+                       multifile=multifile)
 
     logger.info(f'Saved the configuration in {path}')
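A hedged usage sketch for the reformatted `export_config`; the output path is hypothetical, and the keywords simply mirror the signature shown in the diff:

```python
# Hypothetical call site: save a parsed config as YAML, overwriting
# any previous export. Keywords mirror the signature in the diff.
export_config(cfg,
              path='outputs/exported_config.yaml',
              format='yaml',
              skip_none=True,
              overwrite=True)
```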
4 changes: 2 additions & 2 deletions data_juicer/core/analyser.py
@@ -121,8 +121,8 @@ def run(self, load_data_np=None, skip_export=False):
 
         logger.info('Applying overall analysis on stats...')
         overall_analysis = OverallAnalysis(dataset, self.analysis_path)
-        self.overall_result = overall_analysis.analyse(
-            num_proc=self.cfg.np, skip_export=skip_export)
+        self.overall_result = overall_analysis.analyse(num_proc=self.cfg.np,
+                                                       skip_export=skip_export)
 
         logger.info(f'The overall analysis results are: {self.overall_result}')
3 changes: 3 additions & 0 deletions data_juicer/ops/filter/__init__.py
@@ -1,3 +1,4 @@
+# yapf: disable
 from . import (alphanumeric_filter, average_line_length_filter,
                character_repetition_filter, face_area_filter,
                flagged_words_filter, image_aspect_ratio_filter,
@@ -9,3 +10,5 @@
                stopwords_filter, suffix_filter, text_action_filter,
                text_entity_dependency_filter, text_length_filter,
                token_num_filter, word_num_filter, word_repetition_filter)
+
+# yapf: enable
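The new markers fence off the import block so yapf leaves its hand-wrapped layout alone. A small self-contained sketch of the same technique (the constant is illustrative):

```python
# yapf skips everything between these markers, so hand-aligned
# constructs survive reformatting.
# yapf: disable
IDENTITY = [
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
]
# yapf: enable
```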
3 changes: 1 addition & 2 deletions data_juicer/ops/filter/language_id_score_filter.py
@@ -54,8 +54,7 @@ def compute_stats(self, sample):
             return sample
 
         text = sample[self.text_key].lower().replace('\n', ' ')
-        ft_model = get_model(self.model_key,
-                             model_type='fasttext')
+        ft_model = get_model(self.model_key, model_type='fasttext')
         if ft_model is None:
             err_msg = 'Model not loaded. Please retry later.'
             logger.error(err_msg)
3 changes: 3 additions & 0 deletions data_juicer/ops/mapper/__init__.py
@@ -1,3 +1,4 @@
+# yapf: disable
 from . import (chinese_convert_mapper, clean_copyright_mapper,
                clean_email_mapper, clean_html_mapper, clean_ip_mapper,
                clean_links_mapper, expand_macro_mapper, fix_unicode_mapper,
@@ -10,3 +11,5 @@
                remove_words_with_incorrect_substrings_mapper,
                replace_content_mapper, sentence_split_mapper,
                whitespace_normalization_mapper)
+
+# yapf: enable
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_email_mapper.py
@@ -23,7 +23,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
         self.pattern = pattern
         if ((len(pattern) > 2) and
                 (pattern.startswith("r'") and pattern.endswith("'")
-                or pattern.startswith('r"') and pattern.endswith('"'))):
+                 or pattern.startswith('r"') and pattern.endswith('"'))):
             self.pattern = pattern[2:-1]
 
         self.repl = repl
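The same raw-string unwrapping appears in `clean_ip_mapper` and `clean_links_mapper` below. A standalone sketch of the branch above (the helper name is hypothetical, not part of the repo): patterns that arrive as literal text such as "r'...'" from a YAML config are stripped to the bare regex before compilation.

```python
import re


def unwrap_raw_pattern(pattern: str) -> str:
    # Hypothetical helper mirroring the branch above: turn the literal
    # text "r'...'" or 'r"..."' into the bare regex between the quotes.
    if ((len(pattern) > 2)
            and (pattern.startswith("r'") and pattern.endswith("'")
                 or pattern.startswith('r"') and pattern.endswith('"'))):
        return pattern[2:-1]
    return pattern


print(unwrap_raw_pattern("r'[\\w.-]+@[\\w.-]+'"))  # [\w.-]+@[\w.-]+
re.compile(unwrap_raw_pattern("r'[\\w.-]+@[\\w.-]+'"))  # compiles cleanly
```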
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_ip_mapper.py
@@ -28,7 +28,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
         self.pattern = pattern
         if ((len(pattern) > 2) and
                 (pattern.startswith("r'") and pattern.endswith("'")
-                or pattern.startswith('r"') and pattern.endswith('"'))):
+                 or pattern.startswith('r"') and pattern.endswith('"'))):
             self.pattern = pattern[2:-1]
         self.repl = repl
 
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/clean_links_mapper.py
@@ -34,7 +34,7 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
         self.pattern = pattern
         if ((len(pattern) > 2) and
                 (pattern.startswith("r'") and pattern.endswith("'")
-                or pattern.startswith('r"') and pattern.endswith('"'))):
+                 or pattern.startswith('r"') and pattern.endswith('"'))):
             self.pattern = pattern[2:-1]
         self.repl = repl
 
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/fix_unicode_mapper.py
@@ -34,7 +34,6 @@ def __init__(self, normalization: str = None, *args, **kwargs):
                              '["NFC", "NFKC", "NFD", "NFKD"]')
 
     def process(self, sample):
-        sample[self.text_key] = ftfy.fix_text(
-            sample[self.text_key],
-            normalization=self.normalization)
+        sample[self.text_key] = ftfy.fix_text(sample[self.text_key],
+                                              normalization=self.normalization)
         return sample
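For context on the reformatted call: `ftfy.fix_text` repairs mojibake, and the `normalization` argument selects a Unicode form. A quick sketch, assuming `ftfy` is installed:

```python
import ftfy

# The classic mojibake repair from ftfy's documentation:
print(ftfy.fix_text('âœ” No problems'))  # -> '✔ No problems'
# The `normalization` argument used above selects a Unicode form,
# e.g. NFKC folds the 'fi' ligature into plain letters:
print(ftfy.fix_text('ﬁle', normalization='NFKC'))  # -> 'file'
```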
13 changes: 6 additions & 7 deletions data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
@@ -4,10 +4,10 @@
 
 
 def split_sentence(text):
-    text = re.sub('([.。!!?\?])([^’”])',r'\1\n\2',text)  # noqa
-    text = re.sub('(\.{6})([^’”])',r'\1\n\2',text)  # noqa
-    text = re.sub('(\…{2})([^’”])',r'\1\n\2',text)  # noqa
-    text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])',r'\1\n\2',text)  # noqa
+    text = re.sub('([.。!!?\?])([^’”])', r'\1\n\2', text)  # noqa
+    text = re.sub('(\.{6})([^’”])', r'\1\n\2', text)  # noqa
+    text = re.sub('(\…{2})([^’”])', r'\1\n\2', text)  # noqa
+    text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2', text)  # noqa
     return text.split('\n')
 
 
@@ -40,9 +40,8 @@ def __init__(self,
         super().__init__(*args, **kwargs)
         self.lowercase = lowercase
         self.min_repeat_sentence_length = min_repeat_sentence_length
-        self.remove_regex = re.compile(
-            r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
-        ) if ignore_special_character else None
+        self.remove_regex = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
+                                       ) if ignore_special_character else None
 
     def process(self, sample):
 
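A usage sketch for `split_sentence` from the first hunk of this file; the sample string is illustrative. Terminators (Chinese or English) not followed by a closing quote become newline split points, and a space after an English period travels to the start of the next segment:

```python
# Assuming split_sentence from the hunk above is in scope.
print(split_sentence('今天天气很好。我们去公园。Sentence one. Sentence two.'))
# -> ['今天天气很好。', '我们去公园。', 'Sentence one.', ' Sentence two.']
```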
6 changes: 3 additions & 3 deletions data_juicer/ops/mapper/replace_content_mapper.py
@@ -20,9 +20,9 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
         """
         super().__init__(*args, **kwargs)
         self.pattern = pattern
-        if ((pattern is not None and len(pattern) > 2) and
-                (pattern.startswith("r'") and pattern.endswith("'") or
-                 pattern.startswith('r"') and pattern.endswith('"'))):
+        if ((pattern is not None and len(pattern) > 2)
+                and (pattern.startswith("r'") and pattern.endswith("'")
+                     or pattern.startswith('r"') and pattern.endswith('"'))):
             self.pattern = pattern[2:-1]
         self.repl = repl
 
2 changes: 0 additions & 2 deletions data_juicer/utils/availability_utils.py
@@ -1,5 +1,3 @@
-
-
 from loguru import logger
 
 UNAVAILABLE_OPERATORS = {}
4 changes: 2 additions & 2 deletions data_juicer/utils/constant.py
@@ -63,8 +63,8 @@ def get_access_log(cls, dj_cfg=None):
                     # Create a stream reader for the file and decode the
                     # first line
                     with dctx.stream_reader(compressed_file) as reader:
-                        text_stream = io.TextIOWrapper(
-                            reader, encoding='utf-8')
+                        text_stream = io.TextIOWrapper(reader,
+                                                       encoding='utf-8')
                         first_line = text_stream.readline()
             elif 'jsonl' in dj_cfg.dataset_path:
                 tmp_f_name = dj_cfg.dataset_path. \
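The reformatted block peeks at the first record of a zstd-compressed JSONL dataset. A self-contained sketch of the same pattern, assuming the `zstandard` package and a hypothetical file path:

```python
import io

import zstandard

# Hypothetical path; any .jsonl.zst file works.
with open('dataset.jsonl.zst', 'rb') as compressed_file:
    dctx = zstandard.ZstdDecompressor()
    # Stream-decompress just enough bytes to decode the first line.
    with dctx.stream_reader(compressed_file) as reader:
        text_stream = io.TextIOWrapper(reader, encoding='utf-8')
        first_line = text_stream.readline()
print(first_line)
```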