Skip to content

Commit

Permalink
pre-commit checked
Browse files: browse the repository at this point in the history
  • Loading branch information
yxdyc committed Nov 14, 2023
1 parent d044c5b commit df4911c
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 22 deletions.
11 changes: 5 additions & 6 deletions data_juicer/ops/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from . import (alphanumeric_filter, average_line_length_filter,
character_repetition_filter, flagged_words_filter,
image_aspect_ratio_filter, image_size_filter,
language_id_score_filter,
maximum_line_length_filter, perplexity_filter,
special_characters_filter, specified_field_filter,
specified_numeric_field_filter, stopwords_filter, suffix_filter,
text_length_filter, token_num_filter, word_num_filter,
word_repetition_filter)
language_id_score_filter, maximum_line_length_filter,
perplexity_filter, special_characters_filter,
specified_field_filter, specified_numeric_field_filter,
stopwords_filter, suffix_filter, text_length_filter,
token_num_filter, word_num_filter, word_repetition_filter)
18 changes: 7 additions & 11 deletions data_juicer/ops/filter/image_size_filter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@

import numpy as np

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import get_image_size, size_to_bytes

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES
from data_juicer.utils.mm_utils import get_image_size, size_to_bytes


@OPERATORS.register_module('image_size_filter')
Expand All @@ -16,8 +15,8 @@ class ImageSizeFilter(Filter):
"""

def __init__(self,
min_size: str = "0",
max_size: str = "1Tb",
min_size: str = '0',
max_size: str = '1Tb',
any_or_all: str = 'any',
*args,
**kwargs):
Expand Down Expand Up @@ -56,19 +55,17 @@ def compute_stats(self, sample, context=False):

# for size calculation, no need to load images into memory
sample[Fields.stats][StatsKeys.image_sizes] = [
get_image_size(img_path)
for img_path in sample[self.image_key]
get_image_size(img_path) for img_path in sample[self.image_key]
]

return sample

def process(self, sample):
image_sizes = sample[Fields.stats][StatsKeys.image_sizes]
keep_bools = np.array([
size_to_bytes(self.min_size)
<= image_size <=
size_to_bytes(self.max_size)
for image_size in image_sizes])
size_to_bytes(self.min_size) <= image_size <= size_to_bytes(
self.max_size) for image_size in image_sizes
])
if len(keep_bools) <= 0:
return True

Expand All @@ -77,4 +74,3 @@ def process(self, sample):
return keep_bools.any()
else:
return keep_bools.all()

9 changes: 4 additions & 5 deletions data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def size_to_bytes(size):
numbers_list = [char for char in size if char.isdigit()]

if len(numbers_list) == 0:
raise ValueError(f"Your input `size` does not contain numbers: {size}")
raise ValueError(f'Your input `size` does not contain numbers: {size}')

size_numbers = int(float(''.join(numbers_list)))

Expand Down Expand Up @@ -61,7 +61,6 @@ def size_to_bytes(size):
elif suffix == 'yb' or suffix == 'yib':
return size_numbers << 80
else:
raise ValueError(f"You specified unidentifiable unit: {suffix}, "
f"expected in [KB, MB, GB, TB, PB, EB, ZB, YB, "
f"KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB]")

raise ValueError(f'You specified unidentifiable unit: {suffix}, '
f'expected in [KB, MB, GB, TB, PB, EB, ZB, YB, '
f'KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB]')

0 comments on commit df4911c

Please sign in to comment.