From 2fdf484143762f746cf560f21fb13c03bc97fbf5 Mon Sep 17 00:00:00 2001
From: Yilun Huang <lielin.hyl@alibaba-inc.com>
Date: Fri, 20 Dec 2024 11:03:00 +0800
Subject: [PATCH] [Feature] add auto mode for analyzer (#512)

* + add auto mode for analyzer: load all filters that produce stats to analyze the target dataset

* + add default mem_required for those model-based OPs

* - support wordcloud drawing for str or str list fields in stats
- support set the number of samples to be analyzed in auto mode. It's 1k in default.

* - take the minimum one of dataset length and auto num

* * update default export path

* * set version limit for wandb to avoid exception

* + add docs for auto mode
---
 README.md                                     |  5 ++
 README_ZH.md                                  |  4 ++
 configs/config_all.yaml                       |  2 +-
 data_juicer/analysis/column_wise_analysis.py  | 69 ++++++++++++++-----
 data_juicer/config/config.py                  | 52 ++++++++++++--
 data_juicer/core/analyzer.py                  |  6 +-
 data_juicer/ops/filter/__init__.py            |  7 ++
 .../ops/filter/image_aesthetics_filter.py     |  2 +-
 data_juicer/ops/filter/image_nsfw_filter.py   |  1 +
 .../ops/filter/image_text_matching_filter.py  |  1 +
 .../filter/image_text_similarity_filter.py    |  1 +
 .../ops/filter/image_watermark_filter.py      |  1 +
 .../filter/phrase_grounding_recall_filter.py  |  1 +
 .../ops/filter/video_aesthetics_filter.py     |  2 +-
 .../video_frames_text_similarity_filter.py    |  1 +
 data_juicer/ops/filter/video_nsfw_filter.py   |  1 +
 .../video_tagging_from_frames_filter.py       |  1 +
 .../ops/filter/video_watermark_filter.py      |  1 +
 .../ops/mapper/image_captioning_mapper.py     |  2 +
 .../ops/mapper/image_diffusion_mapper.py      |  1 +
 .../ops/mapper/image_tagging_mapper.py        |  1 +
 .../video_captioning_from_audio_mapper.py     |  1 +
 .../video_captioning_from_frames_mapper.py    |  1 +
 ...video_captioning_from_summarizer_mapper.py |  1 +
 .../video_captioning_from_video_mapper.py     |  1 +
 .../mapper/video_tagging_from_audio_mapper.py |  1 +
 .../video_tagging_from_frames_mapper.py       |  1 +
 environments/dev_requires.txt                 |  2 +-
 environments/minimal_requires.txt             |  1 +
 environments/sandbox_requires.txt             |  1 -
 30 files changed, 143 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 518e54713..d891ac332 100644
--- a/README.md
+++ b/README.md
@@ -333,6 +333,11 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml
 
 # use command line tool
 dj-analyze --config configs/demo/analyzer.yaml
+
+# you can also use auto mode to avoid writing a recipe. It will analyze a small
+# part (e.g. 1000 samples, specified by argument `auto_num`) of your dataset 
+# with all Filters that produce stats.
+dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000]
 ```
 
 - **Note:** Analyzer only compute stats of Filter ops. So extra Mapper or Deduplicator ops will be ignored in the analysis process.
diff --git a/README_ZH.md b/README_ZH.md
index 366fcb004..01633731b 100644
--- a/README_ZH.md
+++ b/README_ZH.md
@@ -310,6 +310,10 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml
 
 # 使用命令行工具
 dj-analyze --config configs/demo/analyzer.yaml
+
+# 你也可以使用"自动"模式来避免写一个新的数据菜谱。它会使用全部可产出统计信息的 Filter 来分析
+# 你的数据集的一小部分（如1000条样本，可通过 `auto_num` 参数指定）
+dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000]
 ```
 
 * **注意**：Analyzer 只计算 Filter 算子的状态，其他的算子（例如 Mapper 和 Deduplicator）会在分析过程中被忽略。
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 9811b0e97..82cd6824e 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -594,7 +594,7 @@ process:
       vertical_flip: false                                    # flip frame image vertically (top to bottom).
       reduce_mode: avg                                        # reduce mode when one text corresponds to multiple videos in a chunk,  must be one of ['avg','max', 'min'].
       any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
-      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
   - video_motion_score_filter:                              # Keep samples with video motion scores within a specific range.
       min_score: 0.25                                         # the minimum motion score to keep samples
       max_score: 10000.0                                      # the maximum motion score to keep samples
diff --git a/data_juicer/analysis/column_wise_analysis.py b/data_juicer/analysis/column_wise_analysis.py
index 775b42683..825d9b4dd 100644
--- a/data_juicer/analysis/column_wise_analysis.py
+++ b/data_juicer/analysis/column_wise_analysis.py
@@ -4,6 +4,7 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 from tqdm import tqdm
+from wordcloud import WordCloud
 
 from data_juicer.utils.constant import Fields
 
@@ -145,33 +146,39 @@ def analyze(self, show_percentiles=False, show=False, skip_export=False):
                 else:
                     axes = [None] * num_subcol
 
-                # draw histogram
-                self.draw_hist(axes[0],
-                               data,
-                               os.path.join(self.output_path,
-                                            f'{column_name}-hist.png'),
-                               percentiles=percentiles)
-
-                # draw box
-                self.draw_box(axes[1],
-                              data,
-                              os.path.join(self.output_path,
-                                           f'{column_name}-box.png'),
-                              percentiles=percentiles)
+                if not skip_export:
+                    # draw histogram
+                    self.draw_hist(axes[0],
+                                   data,
+                                   os.path.join(self.output_path,
+                                                f'{column_name}-hist.png'),
+                                   percentiles=percentiles)
+
+                    # draw box
+                    self.draw_box(axes[1],
+                                  data,
+                                  os.path.join(self.output_path,
+                                               f'{column_name}-box.png'),
+                                  percentiles=percentiles)
             else:
                 # object (string) or string list -- only draw histogram for
                 # this stat
                 if self.save_stats_in_one_file:
-                    axes = subfig.subplots(1, 1)
+                    axes = subfig.subplots(1, num_subcol)
                 else:
-                    axes = None
+                    axes = [None] * num_subcol
 
                 if not skip_export:
                     self.draw_hist(
-                        axes, data,
+                        axes[0], data,
                         os.path.join(self.output_path,
                                      f'{column_name}-hist.png'))
 
+                    self.draw_wordcloud(
+                        axes[1], data,
+                        os.path.join(self.output_path,
+                                     f'{column_name}-wordcloud.png'))
+
             # add a title to the figure of this stat
             if self.save_stats_in_one_file:
                 subfig.suptitle(f'{data.name}',
@@ -297,3 +304,33 @@ def draw_box(self, ax, data, save_path, percentiles=None, show=False):
                 # accumulated overlapped figures in different draw_xxx function
                 # calling
                 ax.clear()
+
+    def draw_wordcloud(self, ax, data, save_path, show=False):
+        word_list = data.tolist()
+        word_nums = {}
+        for w in word_list:
+            if w in word_nums:
+                word_nums[w] += 1
+            else:
+                word_nums[w] = 1
+
+        wc = WordCloud(width=400, height=320)
+        wc.generate_from_frequencies(word_nums)
+
+        if ax is None:
+            ax = plt.figure(figsize=(20, 16))
+        else:
+            ax.imshow(wc, interpolation='bilinear')
+            ax.axis('off')
+
+        if not self.save_stats_in_one_file:
+            # save into file
+            wc.to_file(save_path)
+
+            if show:
+                plt.show()
+            else:
+                # if no showing, we need to clear this axes to avoid
+                # accumulated overlapped figures in different draw_xxx function
+                # calling
+                ax.clear()
diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py
index 6cf30b5a9..c7f0aaf38 100644
--- a/data_juicer/config/config.py
+++ b/data_juicer/config/config.py
@@ -23,7 +23,7 @@
 global_parser = None
 
 
-def init_configs(args: Optional[List[str]] = None):
+def init_configs(args: Optional[List[str]] = None, which_entry: object = None):
     """
     initialize the jsonargparse parser and parse configs from one of:
         1. POSIX-style commands line args;
@@ -32,14 +32,29 @@ def init_configs(args: Optional[List[str]] = None):
         4. hard-coded defaults
 
     :param args: list of params, e.g., ['--conifg', 'cfg.yaml'], defaut None.
+    :param which_entry: which entry to init configs (executor/analyzer)
     :return: a global cfg object used by the Executor or Analyzer
     """
     parser = ArgumentParser(default_env=True, default_config_files=None)
 
-    parser.add_argument('--config',
-                        action=ActionConfigFile,
-                        help='Path to a dj basic configuration file.',
-                        required=True)
+    # required but mutually exclusive args group
+    required_group = parser.add_mutually_exclusive_group(required=True)
+    required_group.add_argument('--config',
+                                action=ActionConfigFile,
+                                help='Path to a dj basic configuration file.')
+    required_group.add_argument('--auto',
+                                action='store_true',
+                                help='Weather to use an auto analyzing '
+                                'strategy instead of a specific data '
+                                'recipe. If a specific config file is '
+                                'given by --config arg, this arg is '
+                                'disabled. Only available for Analyzer.')
+
+    parser.add_argument('--auto_num',
+                        type=PositiveInt,
+                        default=1000,
+                        help='The number of samples to be analyzed '
+                        'automatically. It\'s 1000 in default.')
 
     parser.add_argument(
         '--hpo_config',
@@ -97,7 +112,7 @@ def init_configs(args: Optional[List[str]] = None):
     parser.add_argument(
         '--export_path',
         type=str,
-        default='./outputs/hello_world.jsonl',
+        default='./outputs/hello_world/hello_world.jsonl',
         help='Path to export and save the output processed dataset. The '
         'directory to store the processed dataset will be the work '
         'directory of this process.')
@@ -339,6 +354,14 @@ def init_configs(args: Optional[List[str]] = None):
 
     try:
         cfg = parser.parse_args(args=args)
+
+        # check the entry
+        from data_juicer.core.analyzer import Analyzer
+        if not isinstance(which_entry, Analyzer) and cfg.auto:
+            err_msg = '--auto argument can only be used for analyzer!'
+            logger.error(err_msg)
+            raise NotImplementedError(err_msg)
+
         cfg = init_setup_from_cfg(cfg)
         cfg = update_op_process(cfg, parser)
 
@@ -488,6 +511,16 @@ def init_setup_from_cfg(cfg: Namespace):
     SpecialTokens.image = cfg.image_special_token
     SpecialTokens.eoc = cfg.eoc_special_token
 
+    # add all filters that produce stats
+    if cfg.auto:
+        import pkgutil
+
+        import data_juicer.ops.filter as djfilters
+        cfg.process = [{
+            filter_name: {}
+        } for _, filter_name, _ in pkgutil.iter_modules(djfilters.__path__)
+                       if filter_name not in djfilters.NON_STATS_FILTERS]
+
     # Apply text_key modification during initializing configs
     # users can freely specify text_key for different ops using `text_key`
     # otherwise, set arg text_key of each op to text_keys
@@ -636,7 +669,10 @@ def update_op_process(cfg, parser):
     temp_args = namespace_to_arg_list(temp_cfg,
                                       includes=recognized_args,
                                       excludes=['config'])
-    temp_args = ['--config', temp_cfg.config[0].absolute] + temp_args
+    if temp_cfg.config:
+        temp_args = ['--config', temp_cfg.config[0].absolute] + temp_args
+    else:
+        temp_args = ['--auto'] + temp_args
     temp_parser.parse_args(temp_args)
     return cfg
 
@@ -662,6 +698,8 @@ def namespace_to_arg_list(namespace, prefix='', includes=None, excludes=None):
 
 
 def config_backup(cfg: Namespace):
+    if not cfg.config:
+        return
     cfg_path = cfg.config[0].absolute
     work_dir = cfg.work_dir
     target_path = os.path.join(work_dir, os.path.basename(cfg_path))
diff --git a/data_juicer/core/analyzer.py b/data_juicer/core/analyzer.py
index 2ae4d3511..63e512d41 100644
--- a/data_juicer/core/analyzer.py
+++ b/data_juicer/core/analyzer.py
@@ -33,7 +33,7 @@ def __init__(self, cfg: Optional[Namespace] = None):
 
         :param cfg: optional jsonargparse Namespace dict.
         """
-        self.cfg = init_configs() if cfg is None else cfg
+        self.cfg = init_configs(which_entry=self) if cfg is None else cfg
 
         self.work_dir = self.cfg.work_dir
 
@@ -87,6 +87,10 @@ def run(self,
         if load_data_np is None:
             load_data_np = self.cfg.np
         dataset = self.formatter.load_dataset(load_data_np, self.cfg)
+        if self.cfg.auto:
+            # if it's auto analysis, only analyze for a minor part of the input
+            # dataset to save time and computing resource
+            dataset = dataset.take(min(len(dataset), self.cfg.auto_num))
 
         # extract processes
         logger.info('Preparing process operators...')
diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py
index dad6818e1..8cb986b2b 100644
--- a/data_juicer/ops/filter/__init__.py
+++ b/data_juicer/ops/filter/__init__.py
@@ -63,3 +63,10 @@
     'VideoTaggingFromFramesFilter', 'VideoWatermarkFilter',
     'WordRepetitionFilter', 'WordsNumFilter'
 ]
+
+NON_STATS_FILTERS = [
+    'specified_field_filter',
+    'specified_numeric_field_filter',
+    'suffix_filter',
+    'video_tagging_from_frames_filter',
+]
diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py
index bbaba15eb..723845a5d 100644
--- a/data_juicer/ops/filter/image_aesthetics_filter.py
+++ b/data_juicer/ops/filter/image_aesthetics_filter.py
@@ -46,7 +46,7 @@ def __init__(self,
         :param args: Extra positional arguments.
         :param kwargs: Extra keyword arguments.
         """
-
+        kwargs.setdefault('mem_required', '1500MB')
         super().__init__(*args, **kwargs)
         if hf_scorer_model == '':
             hf_scorer_model = \
diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py
index 603a48518..aea409ec4 100644
--- a/data_juicer/ops/filter/image_nsfw_filter.py
+++ b/data_juicer/ops/filter/image_nsfw_filter.py
@@ -41,6 +41,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '1GB')
         super().__init__(*args, **kwargs)
         self.score_threshold = score_threshold
         if any_or_all not in ['any', 'all']:
diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py
index dc36cd68a..6881eccf5 100644
--- a/data_juicer/ops/filter/image_text_matching_filter.py
+++ b/data_juicer/ops/filter/image_text_matching_filter.py
@@ -52,6 +52,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '1500MB')
         super().__init__(*args, **kwargs)
         self.min_score = min_score
         self.max_score = max_score
diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py
index d43c9bc3f..9a3f9361b 100644
--- a/data_juicer/ops/filter/image_text_similarity_filter.py
+++ b/data_juicer/ops/filter/image_text_similarity_filter.py
@@ -53,6 +53,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '1500MB')
         super().__init__(*args, **kwargs)
         self.min_score = min_score
         self.max_score = max_score
diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py
index 0d9eead6a..b752736a4 100644
--- a/data_juicer/ops/filter/image_watermark_filter.py
+++ b/data_juicer/ops/filter/image_watermark_filter.py
@@ -45,6 +45,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '500MB')
         super().__init__(*args, **kwargs)
         self.prob_threshold = prob_threshold
         if any_or_all not in ['any', 'all']:
diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py
index 98a2dfb1f..9dec0dc3c 100644
--- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py
+++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py
@@ -114,6 +114,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '1GB')
         super().__init__(*args, **kwargs)
         self.min_recall = min_recall
         self.max_recall = max_recall
diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py
index 5e674162d..f65334f56 100644
--- a/data_juicer/ops/filter/video_aesthetics_filter.py
+++ b/data_juicer/ops/filter/video_aesthetics_filter.py
@@ -73,7 +73,7 @@ def __init__(self,
         :param args: Extra positional arguments.
         :param kwargs: Extra keyword arguments.
         """
-
+        kwargs.setdefault('mem_required', '1500MB')
         super().__init__(*args, **kwargs)
         if hf_scorer_model == '':
             hf_scorer_model = \
diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py
index 6b3e92641..da793ccf4 100644
--- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py
+++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py
@@ -74,6 +74,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '1500MB')
         super().__init__(*args, **kwargs)
         self.min_score = min_score
         self.max_score = max_score
diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py
index 27bafe1d0..a1dd9d214 100644
--- a/data_juicer/ops/filter/video_nsfw_filter.py
+++ b/data_juicer/ops/filter/video_nsfw_filter.py
@@ -65,6 +65,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '1GB')
         super().__init__(*args, **kwargs)
         self.score_threshold = score_threshold
         if frame_sampling_method not in ['all_keyframes', 'uniform']:
diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py
index 7c41b5521..8872aab32 100644
--- a/data_juicer/ops/filter/video_tagging_from_frames_filter.py
+++ b/data_juicer/ops/filter/video_tagging_from_frames_filter.py
@@ -61,6 +61,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '9GB')
         super().__init__(*args, **kwargs)
         if contain not in ['any', 'all']:
             raise ValueError(f'the containing type [{contain}] is not '
diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py
index 2b7e30f8f..959c91e23 100644
--- a/data_juicer/ops/filter/video_watermark_filter.py
+++ b/data_juicer/ops/filter/video_watermark_filter.py
@@ -69,6 +69,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '500MB')
         super().__init__(*args, **kwargs)
         self.prob_threshold = prob_threshold
         if frame_sampling_method not in ['all_keyframes', 'uniform']:
diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py
index 0bc486193..98bb3ad7c 100644
--- a/data_juicer/ops/mapper/image_captioning_mapper.py
+++ b/data_juicer/ops/mapper/image_captioning_mapper.py
@@ -81,6 +81,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '16GB')
+
         super().__init__(*args, **kwargs)
 
         if keep_candidate_mode not in [
diff --git a/data_juicer/ops/mapper/image_diffusion_mapper.py b/data_juicer/ops/mapper/image_diffusion_mapper.py
index c53d6f56d..53e315844 100644
--- a/data_juicer/ops/mapper/image_diffusion_mapper.py
+++ b/data_juicer/ops/mapper/image_diffusion_mapper.py
@@ -91,6 +91,7 @@ def __init__(self,
         :param hf_img2seq: model name on huggingface to generate caption if
             caption_key is None.
         """
+        kwargs.setdefault('mem_required', '8GB')
         super().__init__(*args, **kwargs)
         self._init_parameters = self.remove_extra_parameters(locals())
         self.strength = strength
diff --git a/data_juicer/ops/mapper/image_tagging_mapper.py b/data_juicer/ops/mapper/image_tagging_mapper.py
index d47fbf0ef..e3fc46f1b 100644
--- a/data_juicer/ops/mapper/image_tagging_mapper.py
+++ b/data_juicer/ops/mapper/image_tagging_mapper.py
@@ -36,6 +36,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '9GB')
         super().__init__(*args, **kwargs)
         self.model_key = prepare_model(
             model_type='recognizeAnything',
diff --git a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py
index 4833409a4..75ffb9b3a 100644
--- a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py
+++ b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py
@@ -32,6 +32,7 @@ def __init__(self, keep_original_sample: bool = True, *args, **kwargs):
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '30GB')
         super().__init__(*args, **kwargs)
         AUTOINSTALL.check([
             'transformers', 'transformers_stream_generator', 'einops',
diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py
index dbf614510..d4c664c5f 100644
--- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py
+++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py
@@ -108,6 +108,7 @@ def __init__(
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '20GB')
         super().__init__(*args, **kwargs)
 
         if keep_candidate_mode not in [
diff --git a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py
index b2f4c8139..67eb7e234 100644
--- a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py
+++ b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py
@@ -81,6 +81,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '40GB')
         super().__init__(*args, **kwargs)
         AUTOINSTALL.check([
             'torch',
diff --git a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py
index 04cd641ab..737626260 100644
--- a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py
+++ b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py
@@ -108,6 +108,7 @@ def __init__(
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '20GB')
         super().__init__(*args, **kwargs)
 
         if keep_candidate_mode not in [
diff --git a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py
index 763a3381c..2c32093a5 100644
--- a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py
+++ b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py
@@ -37,6 +37,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '500MB')
         super().__init__(*args, **kwargs)
         AUTOINSTALL.check(['torchaudio'])
         self.model_key = prepare_model(model_type='huggingface',
diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py
index 26227738b..d4995d3f6 100644
--- a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py
+++ b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py
@@ -55,6 +55,7 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
+        kwargs.setdefault('mem_required', '9GB')
         super().__init__(*args, **kwargs)
         if frame_sampling_method not in ['all_keyframes', 'uniform']:
             raise ValueError(
diff --git a/environments/dev_requires.txt b/environments/dev_requires.txt
index 0ecd058c4..44dd79158 100644
--- a/environments/dev_requires.txt
+++ b/environments/dev_requires.txt
@@ -4,4 +4,4 @@ sphinx
 sphinx-autobuild
 sphinx_rtd_theme
 recommonmark
-wandb
+wandb<=0.19.0
diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt
index 414458edc..71aa0ba38 100644
--- a/environments/minimal_requires.txt
+++ b/environments/minimal_requires.txt
@@ -33,3 +33,4 @@ pydantic>=2.0
 Pillow
 fastapi[standard]>=0.100
 httpx
+wordcloud
diff --git a/environments/sandbox_requires.txt b/environments/sandbox_requires.txt
index 7f1d27a25..6a1791cf8 100644
--- a/environments/sandbox_requires.txt
+++ b/environments/sandbox_requires.txt
@@ -1,5 +1,4 @@
 torch>=1.11.0
-wandb
 fire
 pyspark
 # vbench-related