diff --git a/README.md b/README.md index 518e54713..d891ac332 100644 --- a/README.md +++ b/README.md @@ -333,6 +333,11 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml # use command line tool dj-analyze --config configs/demo/analyzer.yaml + +# you can also use auto mode to avoid writing a recipe. It will analyze a small +# part (e.g. 1000 samples, specified by argument `auto_num`) of your dataset +# with all Filters that produce stats. +dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000] ``` - **Note:** Analyzer only compute stats of Filter ops. So extra Mapper or Deduplicator ops will be ignored in the analysis process. diff --git a/README_ZH.md b/README_ZH.md index 366fcb004..01633731b 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -310,6 +310,10 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml # 使用命令行工具 dj-analyze --config configs/demo/analyzer.yaml + +# 你也可以使用"自动"模式来避免写一个新的数据菜谱。它会使用全部可产出统计信息的 Filter 来分析 +# 你的数据集的一小部分(如1000条样本,可通过 `auto_num` 参数指定) +dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000] ``` * **注意**:Analyzer 只计算 Filter 算子的状态,其他的算子(例如 Mapper 和 Deduplicator)会在分析过程中被忽略。 diff --git a/configs/config_all.yaml b/configs/config_all.yaml index 9811b0e97..82cd6824e 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -594,7 +594,7 @@ process: vertical_flip: false # flip frame image vertically (top to bottom). reduce_mode: avg # reduce mode when one text corresponds to multiple videos in a chunk, must be one of ['avg','max', 'min']. any_or_all: any # keep this sample when any/all videos meet the filter condition - mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched + mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched - video_motion_score_filter: # Keep samples with video motion scores within a specific range. min_score: 0.25 # the minimum motion score to keep samples max_score: 10000.0 # the maximum motion score to keep samples diff --git a/data_juicer/analysis/column_wise_analysis.py b/data_juicer/analysis/column_wise_analysis.py index 775b42683..825d9b4dd 100644 --- a/data_juicer/analysis/column_wise_analysis.py +++ b/data_juicer/analysis/column_wise_analysis.py @@ -4,6 +4,7 @@ import matplotlib.pyplot as plt import pandas as pd from tqdm import tqdm +from wordcloud import WordCloud from data_juicer.utils.constant import Fields @@ -145,33 +146,39 @@ def analyze(self, show_percentiles=False, show=False, skip_export=False): else: axes = [None] * num_subcol - # draw histogram - self.draw_hist(axes[0], - data, - os.path.join(self.output_path, - f'{column_name}-hist.png'), - percentiles=percentiles) - - # draw box - self.draw_box(axes[1], - data, - os.path.join(self.output_path, - f'{column_name}-box.png'), - percentiles=percentiles) + if not skip_export: + # draw histogram + self.draw_hist(axes[0], + data, + os.path.join(self.output_path, + f'{column_name}-hist.png'), + percentiles=percentiles) + + # draw box + self.draw_box(axes[1], + data, + os.path.join(self.output_path, + f'{column_name}-box.png'), + percentiles=percentiles) else: # object (string) or string list -- only draw histogram for # this stat if self.save_stats_in_one_file: - axes = subfig.subplots(1, 1) + axes = subfig.subplots(1, num_subcol) else: - axes = None + axes = [None] * num_subcol if not skip_export: self.draw_hist( - axes, data, + axes[0], data, os.path.join(self.output_path, f'{column_name}-hist.png')) + self.draw_wordcloud( + axes[1], data, + os.path.join(self.output_path, + f'{column_name}-wordcloud.png')) + # add a title to the figure of this stat if self.save_stats_in_one_file: subfig.suptitle(f'{data.name}', @@ -297,3 +304,33 @@ def draw_box(self, ax, data, save_path, percentiles=None, show=False): # accumulated overlapped figures in different draw_xxx function # calling ax.clear() + + def draw_wordcloud(self, ax, data, save_path, show=False): + word_list = data.tolist() + word_nums = {} + for w in word_list: + if w in word_nums: + word_nums[w] += 1 + else: + word_nums[w] = 1 + + wc = WordCloud(width=400, height=320) + wc.generate_from_frequencies(word_nums) + + if ax is None: + ax = plt.figure(figsize=(20, 16)) + else: + ax.imshow(wc, interpolation='bilinear') + ax.axis('off') + + if not self.save_stats_in_one_file: + # save into file + wc.to_file(save_path) + + if show: + plt.show() + else: + # if no showing, we need to clear this axes to avoid + # accumulated overlapped figures in different draw_xxx function + # calling + ax.clear() diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index 6cf30b5a9..c7f0aaf38 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -23,7 +23,7 @@ global_parser = None -def init_configs(args: Optional[List[str]] = None): +def init_configs(args: Optional[List[str]] = None, which_entry: object = None): """ initialize the jsonargparse parser and parse configs from one of: 1. POSIX-style commands line args; @@ -32,14 +32,29 @@ def init_configs(args: Optional[List[str]] = None): 4. hard-coded defaults :param args: list of params, e.g., ['--conifg', 'cfg.yaml'], defaut None. + :param which_entry: which entry to init configs (executor/analyzer) :return: a global cfg object used by the Executor or Analyzer """ parser = ArgumentParser(default_env=True, default_config_files=None) - parser.add_argument('--config', - action=ActionConfigFile, - help='Path to a dj basic configuration file.', - required=True) + # required but mutually exclusive args group + required_group = parser.add_mutually_exclusive_group(required=True) + required_group.add_argument('--config', + action=ActionConfigFile, + help='Path to a dj basic configuration file.') + required_group.add_argument('--auto', + action='store_true', + help='Weather to use an auto analyzing ' + 'strategy instead of a specific data ' + 'recipe. If a specific config file is ' + 'given by --config arg, this arg is ' + 'disabled. Only available for Analyzer.') + + parser.add_argument('--auto_num', + type=PositiveInt, + default=1000, + help='The number of samples to be analyzed ' + 'automatically. It\'s 1000 in default.') parser.add_argument( '--hpo_config', @@ -97,7 +112,7 @@ def init_configs(args: Optional[List[str]] = None): parser.add_argument( '--export_path', type=str, - default='./outputs/hello_world.jsonl', + default='./outputs/hello_world/hello_world.jsonl', help='Path to export and save the output processed dataset. The ' 'directory to store the processed dataset will be the work ' 'directory of this process.') @@ -339,6 +354,14 @@ def init_configs(args: Optional[List[str]] = None): try: cfg = parser.parse_args(args=args) + + # check the entry + from data_juicer.core.analyzer import Analyzer + if not isinstance(which_entry, Analyzer) and cfg.auto: + err_msg = '--auto argument can only be used for analyzer!' + logger.error(err_msg) + raise NotImplementedError(err_msg) + cfg = init_setup_from_cfg(cfg) cfg = update_op_process(cfg, parser) @@ -488,6 +511,16 @@ def init_setup_from_cfg(cfg: Namespace): SpecialTokens.image = cfg.image_special_token SpecialTokens.eoc = cfg.eoc_special_token + # add all filters that produce stats + if cfg.auto: + import pkgutil + + import data_juicer.ops.filter as djfilters + cfg.process = [{ + filter_name: {} + } for _, filter_name, _ in pkgutil.iter_modules(djfilters.__path__) + if filter_name not in djfilters.NON_STATS_FILTERS] + # Apply text_key modification during initializing configs # users can freely specify text_key for different ops using `text_key` # otherwise, set arg text_key of each op to text_keys @@ -636,7 +669,10 @@ def update_op_process(cfg, parser): temp_args = namespace_to_arg_list(temp_cfg, includes=recognized_args, excludes=['config']) - temp_args = ['--config', temp_cfg.config[0].absolute] + temp_args + if temp_cfg.config: + temp_args = ['--config', temp_cfg.config[0].absolute] + temp_args + else: + temp_args = ['--auto'] + temp_args temp_parser.parse_args(temp_args) return cfg @@ -662,6 +698,8 @@ def namespace_to_arg_list(namespace, prefix='', includes=None, excludes=None): def config_backup(cfg: Namespace): + if not cfg.config: + return cfg_path = cfg.config[0].absolute work_dir = cfg.work_dir target_path = os.path.join(work_dir, os.path.basename(cfg_path)) diff --git a/data_juicer/core/analyzer.py b/data_juicer/core/analyzer.py index 2ae4d3511..63e512d41 100644 --- a/data_juicer/core/analyzer.py +++ b/data_juicer/core/analyzer.py @@ -33,7 +33,7 @@ def __init__(self, cfg: Optional[Namespace] = None): :param cfg: optional jsonargparse Namespace dict. """ - self.cfg = init_configs() if cfg is None else cfg + self.cfg = init_configs(which_entry=self) if cfg is None else cfg self.work_dir = self.cfg.work_dir @@ -87,6 +87,10 @@ def run(self, if load_data_np is None: load_data_np = self.cfg.np dataset = self.formatter.load_dataset(load_data_np, self.cfg) + if self.cfg.auto: + # if it's auto analysis, only analyze for a minor part of the input + # dataset to save time and computing resource + dataset = dataset.take(min(len(dataset), self.cfg.auto_num)) # extract processes logger.info('Preparing process operators...') diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py index dad6818e1..8cb986b2b 100644 --- a/data_juicer/ops/filter/__init__.py +++ b/data_juicer/ops/filter/__init__.py @@ -63,3 +63,10 @@ 'VideoTaggingFromFramesFilter', 'VideoWatermarkFilter', 'WordRepetitionFilter', 'WordsNumFilter' ] + +NON_STATS_FILTERS = [ + 'specified_field_filter', + 'specified_numeric_field_filter', + 'suffix_filter', + 'video_tagging_from_frames_filter', +] diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py index bbaba15eb..723845a5d 100644 --- a/data_juicer/ops/filter/image_aesthetics_filter.py +++ b/data_juicer/ops/filter/image_aesthetics_filter.py @@ -46,7 +46,7 @@ def __init__(self, :param args: Extra positional arguments. :param kwargs: Extra keyword arguments. """ - + kwargs.setdefault('mem_required', '1500MB') super().__init__(*args, **kwargs) if hf_scorer_model == '': hf_scorer_model = \ diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py index 603a48518..aea409ec4 100644 --- a/data_juicer/ops/filter/image_nsfw_filter.py +++ b/data_juicer/ops/filter/image_nsfw_filter.py @@ -41,6 +41,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '1GB') super().__init__(*args, **kwargs) self.score_threshold = score_threshold if any_or_all not in ['any', 'all']: diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py index dc36cd68a..6881eccf5 100644 --- a/data_juicer/ops/filter/image_text_matching_filter.py +++ b/data_juicer/ops/filter/image_text_matching_filter.py @@ -52,6 +52,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '1500MB') super().__init__(*args, **kwargs) self.min_score = min_score self.max_score = max_score diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py index d43c9bc3f..9a3f9361b 100644 --- a/data_juicer/ops/filter/image_text_similarity_filter.py +++ b/data_juicer/ops/filter/image_text_similarity_filter.py @@ -53,6 +53,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '1500MB') super().__init__(*args, **kwargs) self.min_score = min_score self.max_score = max_score diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py index 0d9eead6a..b752736a4 100644 --- a/data_juicer/ops/filter/image_watermark_filter.py +++ b/data_juicer/ops/filter/image_watermark_filter.py @@ -45,6 +45,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '500MB') super().__init__(*args, **kwargs) self.prob_threshold = prob_threshold if any_or_all not in ['any', 'all']: diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py index 98a2dfb1f..9dec0dc3c 100644 --- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py +++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py @@ -114,6 +114,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '1GB') super().__init__(*args, **kwargs) self.min_recall = min_recall self.max_recall = max_recall diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 5e674162d..f65334f56 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -73,7 +73,7 @@ def __init__(self, :param args: Extra positional arguments. :param kwargs: Extra keyword arguments. """ - + kwargs.setdefault('mem_required', '1500MB') super().__init__(*args, **kwargs) if hf_scorer_model == '': hf_scorer_model = \ diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index 6b3e92641..da793ccf4 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -74,6 +74,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '1500MB') super().__init__(*args, **kwargs) self.min_score = min_score self.max_score = max_score diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index 27bafe1d0..a1dd9d214 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -65,6 +65,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '1GB') super().__init__(*args, **kwargs) self.score_threshold = score_threshold if frame_sampling_method not in ['all_keyframes', 'uniform']: diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py index 7c41b5521..8872aab32 100644 --- a/data_juicer/ops/filter/video_tagging_from_frames_filter.py +++ b/data_juicer/ops/filter/video_tagging_from_frames_filter.py @@ -61,6 +61,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '9GB') super().__init__(*args, **kwargs) if contain not in ['any', 'all']: raise ValueError(f'the containing type [{contain}] is not ' diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 2b7e30f8f..959c91e23 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -69,6 +69,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '500MB') super().__init__(*args, **kwargs) self.prob_threshold = prob_threshold if frame_sampling_method not in ['all_keyframes', 'uniform']: diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py index 0bc486193..98bb3ad7c 100644 --- a/data_juicer/ops/mapper/image_captioning_mapper.py +++ b/data_juicer/ops/mapper/image_captioning_mapper.py @@ -81,6 +81,8 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '16GB') + super().__init__(*args, **kwargs) if keep_candidate_mode not in [ diff --git a/data_juicer/ops/mapper/image_diffusion_mapper.py b/data_juicer/ops/mapper/image_diffusion_mapper.py index c53d6f56d..53e315844 100644 --- a/data_juicer/ops/mapper/image_diffusion_mapper.py +++ b/data_juicer/ops/mapper/image_diffusion_mapper.py @@ -91,6 +91,7 @@ def __init__(self, :param hf_img2seq: model name on huggingface to generate caption if caption_key is None. """ + kwargs.setdefault('mem_required', '8GB') super().__init__(*args, **kwargs) self._init_parameters = self.remove_extra_parameters(locals()) self.strength = strength diff --git a/data_juicer/ops/mapper/image_tagging_mapper.py b/data_juicer/ops/mapper/image_tagging_mapper.py index d47fbf0ef..e3fc46f1b 100644 --- a/data_juicer/ops/mapper/image_tagging_mapper.py +++ b/data_juicer/ops/mapper/image_tagging_mapper.py @@ -36,6 +36,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '9GB') super().__init__(*args, **kwargs) self.model_key = prepare_model( model_type='recognizeAnything', diff --git a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py index 4833409a4..75ffb9b3a 100644 --- a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py @@ -32,6 +32,7 @@ def __init__(self, keep_original_sample: bool = True, *args, **kwargs): :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '30GB') super().__init__(*args, **kwargs) AUTOINSTALL.check([ 'transformers', 'transformers_stream_generator', 'einops', diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index dbf614510..d4c664c5f 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -108,6 +108,7 @@ def __init__( :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '20GB') super().__init__(*args, **kwargs) if keep_candidate_mode not in [ diff --git a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py index b2f4c8139..67eb7e234 100644 --- a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py @@ -81,6 +81,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '40GB') super().__init__(*args, **kwargs) AUTOINSTALL.check([ 'torch', diff --git a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py index 04cd641ab..737626260 100644 --- a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py @@ -108,6 +108,7 @@ def __init__( :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '20GB') super().__init__(*args, **kwargs) if keep_candidate_mode not in [ diff --git a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py index 763a3381c..2c32093a5 100644 --- a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py @@ -37,6 +37,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '500MB') super().__init__(*args, **kwargs) AUTOINSTALL.check(['torchaudio']) self.model_key = prepare_model(model_type='huggingface', diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py index 26227738b..d4995d3f6 100644 --- a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py @@ -55,6 +55,7 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ + kwargs.setdefault('mem_required', '9GB') super().__init__(*args, **kwargs) if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( diff --git a/environments/dev_requires.txt b/environments/dev_requires.txt index 0ecd058c4..44dd79158 100644 --- a/environments/dev_requires.txt +++ b/environments/dev_requires.txt @@ -4,4 +4,4 @@ sphinx sphinx-autobuild sphinx_rtd_theme recommonmark -wandb +wandb<=0.19.0 diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index 414458edc..71aa0ba38 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -33,3 +33,4 @@ pydantic>=2.0 Pillow fastapi[standard]>=0.100 httpx +wordcloud diff --git a/environments/sandbox_requires.txt b/environments/sandbox_requires.txt index 7f1d27a25..6a1791cf8 100644 --- a/environments/sandbox_requires.txt +++ b/environments/sandbox_requires.txt @@ -1,5 +1,4 @@ torch>=1.11.0 -wandb fire pyspark # vbench-related