Commit
* fix pre-commit failures
+ add pre-commit action
HYLcool committed Nov 13, 2023
1 parent 16d159f commit 1595714
Showing 25 changed files with 301 additions and 254 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/pre-commit.yml
@@ -0,0 +1,12 @@

name: pre-commit

on: [push, pull_request]

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v3
    - uses: actions/setup-python@v3
    - uses: pre-commit/action@v3.0.0
8 changes: 7 additions & 1 deletion .pre-commit-config.yaml
@@ -34,4 +34,10 @@ repos:
exclude: thirdparty/
args: [ "--fix=lf" ]

exclude: 'docs/.*'
exclude: |
  (?x)^(
    docs/.*|
    tests/.*|
    demos/.*|
    .*\.md
  )$
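For context (not part of the commit): pre-commit applies this top-level exclude as a Python regular expression against each staged file path, so the new verbose-mode pattern skips the docs/, tests/, and demos/ trees plus all Markdown files. A minimal sketch of that matching, using the pattern as committed and a few made-up paths:

import re

# The same verbose-mode pattern added by the new top-level `exclude`.
EXCLUDE = re.compile(r"""(?x)^(
    docs/.*|
    tests/.*|
    demos/.*|
    .*\.md
)$""")

for path in ['docs/conf.py', 'tests/test_config.py', 'demos/app.py',
             'README.md', 'data_juicer/core/exporter.py']:
    status = 'skipped' if EXCLUDE.search(path) else 'checked'
    print(f'{path}: {status}')
# Only data_juicer/core/exporter.py is still checked by the hooks.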
12 changes: 6 additions & 6 deletions data_juicer/config/config.py
@@ -113,15 +113,15 @@ def init_configs(args=None):
type=str,
default=SpecialTokens.image,
help='The special token that represents an image in the text. In '
'default, it\'s "<__dj__image>". You can specify your own special'
' token according to your input dataset.')
'default, it\'s "<__dj__image>". You can specify your own special'
' token according to your input dataset.')
parser.add_argument(
'--eoc_special_token',
type=str,
default=SpecialTokens.eoc,
help='The special token that represents the end of a chunk in the '
'text. In default, it\'s "<|__dj__eoc|>". You can specify your '
'own special token according to your input dataset.')
'text. In default, it\'s "<|__dj__eoc|>". You can specify your '
'own special token according to your input dataset.')
parser.add_argument(
'--suffixes',
type=Union[str, List[str], Tuple[str]],
@@ -314,8 +314,8 @@ def init_setup_from_cfg(cfg):
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = os.path.abspath(cfg.dataset_path)
else:
cfg.dataset_dir = os.path.abspath(
os.path.dirname(cfg.dataset_path))
cfg.dataset_dir = os.path.abspath(os.path.dirname(
cfg.dataset_path))
else:
logger.error(f'Input dataset_path [{cfg.dataset_path}] is invalid. '
f'Please check and retry.')
5 changes: 4 additions & 1 deletion data_juicer/core/exporter.py
@@ -197,7 +197,10 @@ def to_json(dataset, export_path, num_proc=1, **kwargs):
:param kwargs: extra arguments.
:return:
"""
dataset.to_json(export_path, force_ascii=False, num_proc=num_proc, lines=False)
dataset.to_json(export_path,
force_ascii=False,
num_proc=num_proc,
lines=False)

@staticmethod
def to_parquet(dataset, export_path, **kwargs):
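Aside from the commit itself: the reformatted call above keeps the same behavior, exporting through HuggingFace datasets with lines=False so the output is a single JSON array rather than JSON Lines. A rough usage sketch (file names made up):

from datasets import Dataset

ds = Dataset.from_dict({'text': ['hello', 'world']})
# lines=False -> one JSON array: [{"text": "hello"}, {"text": "world"}]
ds.to_json('export.json', force_ascii=False, lines=False)
# default lines=True -> JSON Lines: one object per line
ds.to_json('export.jsonl', force_ascii=False)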
4 changes: 2 additions & 2 deletions data_juicer/core/ray_executor.py
@@ -67,8 +67,8 @@ def run(self, load_data_np=None):
dataset = dataset.filter(op.process)
else:
logger.error(
'Ray executor only support Filter and Mapper OPs for now'
)
'Ray executor only support Filter and Mapper OPs for '
'now')
raise NotImplementedError
except: # noqa: E722
logger.error(f'An error occurred during Op [{op_name}].')
22 changes: 10 additions & 12 deletions data_juicer/format/formatter.py
@@ -51,9 +51,7 @@ def __init__(
self.data_files = find_files_with_suffix(dataset_path, suffixes)
self.add_suffix = add_suffix

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from dataset file or dataset directory, and unify its
format.
@@ -103,9 +101,7 @@ def __init__(self,
self.text_keys = text_keys
self.kwargs = kwargs

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from HuggingFace, and unify its format.
@@ -226,8 +222,10 @@ def rel2abs(sample, path_keys, dataset_dir):
paths = sample[path_key]
if not paths:
continue
new_paths = [os.path.join(dataset_dir, path)
for path in paths if not os.path.isabs(path)]
new_paths = [
os.path.join(dataset_dir, path) for path in paths
if not os.path.isabs(path)
]
sample[path_key] = new_paths
return sample

@@ -240,10 +238,10 @@ def rel2abs(sample, path_keys, dataset_dir):
'dataset_dir': ds_dir
})
else:
logger.warning(f'No global config passed into unify_format function. '
f'Relative paths in the dataset might not be converted '
f'to their absolute versions. Data of other modalities '
f'might not be able to find by Data-Juicer.')
logger.warning('No global config passed into unify_format function. '
'Relative paths in the dataset might not be converted '
'to their absolute versions. Data of other modalities '
'might not be able to find by Data-Juicer.')

return dataset

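A side note on the reformatted comprehension in rel2abs above: it joins each relative path onto the dataset directory, and (as in the original code) entries that are already absolute are filtered out of new_paths rather than kept. A small illustration with made-up paths:

import os

paths = ['images/cat.jpg', '/mnt/data/dog.jpg']
dataset_dir = '/datasets/demo'
new_paths = [
    os.path.join(dataset_dir, path) for path in paths
    if not os.path.isabs(path)
]
print(new_paths)  # ['/datasets/demo/images/cat.jpg'] -- the absolute entry is dropped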
4 changes: 1 addition & 3 deletions data_juicer/format/text_formatter.py
@@ -96,9 +96,7 @@ def __init__(self,
self.dataset_path = dataset_path
self.add_suffix = add_suffix

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from local text-type files.
48 changes: 28 additions & 20 deletions data_juicer/ops/base_op.py
@@ -2,11 +2,14 @@

OPERATORS = Registry('Operators')


class OP:
def __init__(self,
text_key: str = None,
image_key: str = None,
):

def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class of operators.
@@ -29,12 +32,14 @@ def __init__(self,
def process(self, *args, **kwargs):
raise NotImplementedError


class Mapper(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts data editing.
@@ -63,10 +68,11 @@ def is_batched_op(self):

class Filter(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that removes specific info.
@@ -104,10 +110,11 @@ def process(self, sample):

class Deduplicator(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts deduplication.
@@ -144,10 +151,11 @@ def process(self, dataset, show_num=0):

class Selector(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts selection in dataset-level.
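For context on the classes being reformatted above (not part of the commit): new operators subclass one of these bases and register themselves through the OPERATORS registry. A minimal sketch of a custom mapper, assuming, as the other ops in this repo do, that the base class stores text_key on self.text_key; the op name and class here are hypothetical:

from data_juicer.ops.base_op import OPERATORS, Mapper


@OPERATORS.register_module('lowercase_mapper')  # hypothetical op name
class LowercaseMapper(Mapper):
    """Mapper that lowercases the text of each sample."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def process(self, sample):
        sample[self.text_key] = sample[self.text_key].lower()
        return sample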
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/character_repetition_filter.py
@@ -13,7 +13,7 @@
@OPERATORS.register_module('character_repetition_filter')
class CharacterRepetitionFilter(Filter):
"""Filter to keep samples with char-level n-gram repetition ratio within a
\ specific range."""
specific range."""

def __init__(self,
rep_len: PositiveInt = 10,
8 changes: 3 additions & 5 deletions data_juicer/ops/filter/image_aspect_ratio_filter.py
@@ -1,13 +1,11 @@

import numpy as np

from jsonargparse.typing import PositiveFloat

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import load_image

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES
from data_juicer.utils.mm_utils import load_image


@OPERATORS.register_module('image_aspect_ratio_filter')
@@ -85,7 +83,8 @@ def process(self, sample):
aspect_ratios = sample[Fields.stats][StatsKeys.aspect_ratios]
keep_bools = np.array([
self.min_ratio <= aspect_ratio <= self.max_ratio
for aspect_ratio in aspect_ratios])
for aspect_ratio in aspect_ratios
])
if len(keep_bools) <= 0:
return True

@@ -94,4 +93,3 @@ def process(self, sample):
return keep_bools.any()
else:
return keep_bools.all()

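As an aside, the keep_bools logic reformatted above derives a per-sample keep decision from per-image checks: under the 'any' strategy one in-range image keeps the sample, while 'all' requires every image to pass. A toy illustration with assumed ratio bounds:

import numpy as np

min_ratio, max_ratio = 0.333, 3.0  # assumed bounds for the example
aspect_ratios = [0.2, 1.0]         # one image out of range, one in range
keep_bools = np.array([min_ratio <= r <= max_ratio for r in aspect_ratios])
print(keep_bools.any())  # True  -> kept under the 'any' strategy
print(keep_bools.all())  # False -> dropped under the 'all' strategy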
3 changes: 2 additions & 1 deletion data_juicer/ops/filter/language_id_score_filter.py
@@ -57,6 +57,7 @@ def compute_stats(self, sample):
def process(self, sample):
if self.lang:
return sample[Fields.stats][StatsKeys.lang] == self.lang \
and sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
and sample[Fields.stats][StatsKeys.lang_score] >= \
self.min_score
else:
return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
3 changes: 2 additions & 1 deletion data_juicer/ops/filter/special_characters_filter.py
@@ -50,7 +50,8 @@ def compute_stats(self, sample):
return sample

def process(self, sample):
if self.min_ratio <= sample[Fields.stats][StatsKeys.special_char_ratio] \
if self.min_ratio <= \
sample[Fields.stats][StatsKeys.special_char_ratio] \
<= self.max_ratio:
return True
else:
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/word_repetition_filter.py
@@ -17,7 +17,7 @@
@INTER_WORDS.register_module('word_repetition_filter')
class WordRepetitionFilter(Filter):
"""Filter to keep samples with word-level n-gram repetition ratio within a
\ specific range."""
specific range."""

def __init__(self,
lang: str = 'en',
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/punctuation_normalization_mapper.py
@@ -8,7 +8,7 @@
@OPERATORS.register_module('punctuation_normalization_mapper')
class PunctuationNormalizationMapper(Mapper):
"""Mapper to normalize unicode punctuations to English punctuations in text
\ samples."""
samples."""

def __init__(self, *args, **kwargs):
"""
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/remove_comments_mapper.py
@@ -14,7 +14,7 @@ class RemoveCommentsMapper(Mapper):
"""
Mapper to remove comments in different kinds of documents.
Only support 'tex' \ for now.
Only support 'tex' for now.
"""

def __init__(self,
4 changes: 3 additions & 1 deletion data_juicer/utils/mm_utils.py
@@ -1,8 +1,8 @@

from datasets import Image

from data_juicer.utils.constant import DEFAULT_PREFIX


# A class to keep special tokens for multimodal information in the texts
# The tokens in this class can be updated by corresponding arguments in config
class SpecialTokens(object):
@@ -12,9 +12,11 @@ class SpecialTokens(object):
# others
eoc = f'<|{DEFAULT_PREFIX}eoc|>'


def load_images(paths):
return [load_image(path) for path in paths]


def load_image(path):
img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example(path))
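To round out the mm_utils changes above (not part of the commit): load_image round-trips a path through the datasets Image feature, which yields a PIL image for a local file. A hedged usage sketch with a made-up file name:

from data_juicer.utils.mm_utils import load_image

img = load_image('demo.jpg')  # hypothetical local image file
print(img.size, img.mode)     # e.g. (640, 480) RGB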