Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ops/image face blur mapper #249

Merged
merged 6 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ process:
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated images in the final datasets and the original images will be removed. It's True by default.
caption_key: null # the key name of fields in samples to store captions for each image; the captions guide the diffusion model in producing the images
hf_img2seq: 'Salesforce/blip2-opt-2.7b' # model name on huggingface to generate caption if caption_key is null
- image_face_blur_mapper: # mapper to blur faces detected in images.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be aug_num augmented samples generated in total. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
Expand Down Expand Up @@ -210,10 +213,6 @@ process:
rep_len: 10 # repetition length for char-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- face_area_filter: # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
min_ratio: 0.0 # the min face area ratio of filter range
max_ratio: 0.4 # the max face area ratio of filter range
upsample_num_times: 0 # optional argument passing to the underlying dlib face detector
- flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
lang: en # consider flagged words in what language
tokenization: false # whether to use model to tokenize documents
Expand All @@ -222,15 +221,18 @@ process:
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- image_aspect_ratio_filter: # filter samples according to the aspect ratios of images (a fraction of width by height, r=w/h) in them
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_aesthetics_filter: # filter samples according to the aesthetics score of images.
hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor
min_score: 0.3 # the min aesthetics score of filter range
max_score: 1.0 # the max aesthetics score of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_aspect_ratio_filter: # filter samples according to the aspect ratios of images (a fraction of width by height, r=w/h) in them
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_face_ratio_filter: # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
min_ratio: 0.0 # the min face area ratio of filter range
max_ratio: 0.4 # the max face area ratio of filter range
- image_shape_filter: # filter samples according to the widths and heights of images in them
min_width: 200 # the min width of width filter range
max_width: 5000 # the max width of width filter range
Expand Down
20 changes: 10 additions & 10 deletions data_juicer/ops/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
from . import (alphanumeric_filter, audio_duration_filter,
audio_nmf_snr_filter, audio_size_filter,
average_line_length_filter, character_repetition_filter,
face_area_filter, flagged_words_filter, image_aesthetics_filter,
image_aspect_ratio_filter, image_shape_filter,
image_size_filter, image_text_matching_filter,
image_text_similarity_filter, language_id_score_filter,
maximum_line_length_filter, perplexity_filter,
phrase_grounding_recall_filter, special_characters_filter,
specified_field_filter, specified_numeric_field_filter,
stopwords_filter, suffix_filter, text_action_filter,
text_entity_dependency_filter, text_length_filter,
token_num_filter, video_aesthetics_filter,
flagged_words_filter, image_aesthetics_filter,
image_aspect_ratio_filter, image_face_ratio_filter,
image_shape_filter, image_size_filter,
image_text_matching_filter, image_text_similarity_filter,
language_id_score_filter, maximum_line_length_filter,
perplexity_filter, phrase_grounding_recall_filter,
special_characters_filter, specified_field_filter,
specified_numeric_field_filter, stopwords_filter, suffix_filter,
text_action_filter, text_entity_dependency_filter,
text_length_filter, token_num_filter, video_aesthetics_filter,
video_aspect_ratio_filter, video_duration_filter,
video_frames_text_similarity_filter, video_motion_score_filter,
video_ocr_area_ratio_filter, video_resolution_filter,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,20 @@
from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'face_area_filter'
OP_NAME = 'image_face_ratio_filter'

with AvailabilityChecking(['dlib'], OP_NAME):
import dlib


@OPERATORS.register_module(OP_NAME)
@LOADED_IMAGES.register_module(OP_NAME)
class FaceAreaFilter(Filter):
"""Filter to keep samples with face area ratio within a specific range.
class ImageFaceRatioFilter(Filter):
"""Filter to keep samples with face area ratios within a specific range.
"""

_default_kwargs = {'upsample_num_times': 0}

def __init__(self,
min_ratio: ClosedUnitInterval = 0.0,
max_ratio: ClosedUnitInterval = 0.4,
Expand All @@ -40,18 +42,15 @@ def __init__(self,
:param args: Extra positional arguments.
:param kwargs: Extra keyword arguments.
"""

# Extract face detector arguments from kwargs
detector_keys = ['upsample_num_times']
self.detector_kwargs = {
key: kwargs.pop(key)
for key in detector_keys if key in kwargs
}

super().__init__(*args, **kwargs)
self.min_ratio = min_ratio
self.max_ratio = max_ratio

self.extra_kwargs = {
k: kwargs.get(k, v)
for k, v in self._default_kwargs.items()
}

if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
Expand Down Expand Up @@ -80,7 +79,7 @@ def compute_stats(self, sample, context=False):
face_detections = {}
for key, image in images.items():
img = pil_to_opencv(image)
dets = self.detector(img, **self.detector_kwargs)
dets = self.detector(img, **self.extra_kwargs)
face_detections[key] = [[
max(det.left(), 0),
max(det.top(), 0),
Expand Down
3 changes: 2 additions & 1 deletion data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
clean_ip_mapper, clean_links_mapper, expand_macro_mapper,
fix_unicode_mapper, image_blur_mapper,
image_captioning_from_gpt4v_mapper, image_captioning_mapper,
image_diffusion_mapper, nlpaug_en_mapper, nlpcda_zh_mapper,
image_diffusion_mapper, image_face_blur_mapper,
nlpaug_en_mapper, nlpcda_zh_mapper,
punctuation_normalization_mapper, remove_bibliography_mapper,
remove_comments_mapper, remove_header_mapper,
remove_long_words_mapper, remove_non_chinese_character_mapper,
Expand Down
Loading
Loading