diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 08d447e5b..550d570ae 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -110,6 +110,10 @@ process:
       rep_len: 10                  # repetition length for char-level n-gram
       min_ratio: 0.0               # the min ratio of filter range
       max_ratio: 0.5               # the max ratio of filter range
+  - face_area_filter:              # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
+      min_ratio: 0.0               # the min face area ratio of filter range
+      max_ratio: 0.4               # the max face area ratio of filter range
+      upsample_num_times: 0        # optional argument passed to the underlying dlib face detector
   - flagged_words_filter:          # filter text with the flagged-word ratio larger than a specific max value
       lang: en                     # consider flagged words in what language
       tokenization: false          # whether to use model to tokenize documents
diff --git a/data_juicer/ops/filter/face_area_filter.py b/data_juicer/ops/filter/face_area_filter.py
index 59ef9fa55..ed5623c9f 100644
--- a/data_juicer/ops/filter/face_area_filter.py
+++ b/data_juicer/ops/filter/face_area_filter.py
@@ -63,10 +63,10 @@ def compute_stats(self, sample, context=False):
         if StatsKeys.face_ratios in sample[Fields.stats]:
             return sample
 
-        # there is no image in this sample, still default ratio 0.0
+        # there is no image in this sample
         if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.face_ratios] = np.empty(0,
-                                                                   dtype=float)
+            sample[Fields.stats][StatsKeys.face_ratios] = np.array(
+                [], dtype=np.float64)
             return sample
 
         # load images
@@ -86,38 +86,29 @@
                         # store the image data into context
                         sample[Fields.context][loaded_image_key] = image
 
-        # check if faces detected already
-        if StatsKeys.face_detections not in sample[Fields.stats]:
-            face_detections = {}
-            for key, image in images.items():
-                img = pil_to_opencv(image)
-                dets = self.detector(img, **self.detector_kwargs)
-                dets_formatted = [[
-                    det.left(),
-                    det.top(),
-                    det.width(),
-                    det.height()
-                ] for det in dets] if dets else [[0, 0, 0, 0]]
-                face_detections[key] = dets_formatted
-            sample[Fields.stats][StatsKeys.face_detections] = [
-                face_detections[key] for key in loaded_image_keys
-            ]
-
-        max_face_ratios = []
-        for key, dets in zip(loaded_image_keys,
-                             sample[Fields.stats][StatsKeys.face_detections]):
-            img_area = images[key].width * images[key].height
-            # Calculate the max face ratio for the current image
-            max_face_ratios.append(
-                max([w * h / img_area for _, _, w, h in dets], default=0.0))
-        sample[Fields.stats][StatsKeys.face_ratios] = max_face_ratios
-
+        # detect faces
+        face_detections = {}
+        for key, image in images.items():
+            img = pil_to_opencv(image)
+            dets = self.detector(img, **self.detector_kwargs)
+            face_detections[key] = [[
+                det.left(), det.top(),
+                det.width(), det.height()
+            ] for det in dets]
+
+        # compute face area ratios for each image, considering the largest face
+        face_area_ratios = {}
+        for key, dets in face_detections.items():
+            image_area = images[key].width * images[key].height
+            face_area_ratios[key] = max(
+                [w * h / image_area for _, _, w, h in dets], default=0.0)
+
+        sample[Fields.stats][StatsKeys.face_ratios] = [
+            face_area_ratios[key] for key in loaded_image_keys
+        ]
         return sample
 
     def process(self, sample):
-        if self.image_key not in sample or not sample[self.image_key]:
-            return True
-
         face_ratios = sample[Fields.stats][StatsKeys.face_ratios]
         if len(face_ratios) <= 0:
             return True
diff --git a/tests/ops/filter/test_face_area_filter.py b/tests/ops/filter/test_face_area_filter.py
index c911bf099..0008c9377 100644
--- a/tests/ops/filter/test_face_area_filter.py
+++ b/tests/ops/filter/test_face_area_filter.py
@@ -2,7 +2,6 @@
 import unittest
 
 from datasets import Dataset
-# from data_juicer.core.data import NestedDataset as Dataset
 
 from data_juicer.ops.filter.face_area_filter import FaceAreaFilter
 from data_juicer.utils.constant import Fields
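
For reference, below is a minimal standalone sketch of the behavior the diff converges on: detect faces with dlib, take the largest face per image, divide its area by the image area, and keep a sample whose ratio list is empty or entirely within [min_ratio, max_ratio]. The helper names (largest_face_area_ratio, keep_sample) are illustrative stand-ins, not Data-Juicer APIs; dlib.get_frontal_face_detector and its upsample_num_times argument are the real detector interface used by FaceAreaFilter. The hunks above do not show whether process() requires all images or any image to pass, so the all() below is an assumption.

import dlib  # dlib's frontal face detector is the backend used by FaceAreaFilter
import numpy as np
from PIL import Image

detector = dlib.get_frontal_face_detector()


def largest_face_area_ratio(image_path, upsample_num_times=0):
    """Return area(largest detected face) / area(image), or 0.0 if no face."""
    image = Image.open(image_path).convert('RGB')
    # dlib accepts RGB uint8 numpy arrays directly; the real op converts
    # PIL to OpenCV via pil_to_opencv before detecting
    dets = detector(np.asarray(image), upsample_num_times)
    image_area = image.width * image.height
    # default=0.0 mirrors the diff: an image with no detections gets ratio 0.0
    return max((d.width() * d.height() / image_area for d in dets),
               default=0.0)


def keep_sample(image_paths, min_ratio=0.0, max_ratio=0.4):
    """Hypothetical stand-in for FaceAreaFilter.process."""
    face_ratios = [largest_face_area_ratio(p) for p in image_paths]
    if len(face_ratios) <= 0:
        # no images -> keep, matching the early return in the new process()
        return True
    # assumption: every image must fall inside the ratio range
    return all(min_ratio <= r <= max_ratio for r in face_ratios)

The sketch also reflects the PR's simplification: the [[0, 0, 0, 0]] placeholder for face-less images and the cached StatsKeys.face_detections entry are gone, since max(..., default=0.0) already yields the same 0.0 ratio the placeholder used to produce.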