diff --git a/Dockerfile b/Dockerfile index 03aa89a5b..a3f8eb55b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,6 +16,9 @@ ENV JAVA_HOME=/opt/jdk WORKDIR /data-juicer +# install requirements which need to be installed from source +RUN pip install git+https://github.com/xinyu1205/recognize-anything.git --default-timeout 1000 + # install requirements first to better reuse installed library cache COPY environments/ environments/ RUN cat environments/* | xargs pip install --default-timeout 1000 diff --git a/README.md b/README.md index b7a155275..d215eda24 100644 --- a/README.md +++ b/README.md @@ -175,15 +175,15 @@ pip install -v -e .[tools] # install a subset of tools dependencies The dependency options are listed below: -| Tag | Description | -|--------------|----------------------------------------------------------------------------------------------| +| Tag | Description | +|------------------|----------------------------------------------------------------------------------------------| | `.` or `.[mini]` | Install minimal dependencies for basic Data-Juicer. | -| `.[all]` | Install all optional dependencies (including minimal dependencies and all of the following). | -| `.[sci]` | Install all dependencies for all OPs. | -| `.[dist]` | Install dependencies for distributed data processing. (Experimental) | -| `.[dev]` | Install dependencies for developing the package as contributors. | -| `.[tools]` | Install dependencies for dedicated tools, such as quality classifiers. | -| `.[sandbox]` | Install dependencies for sandbox, such as VBench for video evaluation. | +| `.[all]` | Install all optional dependencies (including minimal dependencies and all of the following). | +| `.[sci]` | Install all dependencies for all OPs. | +| `.[sandbox]` | Install all dependencies for sandbox. | +| `.[dist]` | Install dependencies for distributed data processing. (Experimental) | +| `.[dev]` | Install dependencies for developing the package as contributors. | +| `.[tools]` | Install dependencies for dedicated tools, such as quality classifiers. | ### Using pip @@ -214,6 +214,8 @@ pip install py-data-juicer ```shell docker build -t datajuicer/data-juicer:<version_tag> . ``` + + - The format of `<version_tag>` is like `v0.2.0`, which is the same as the release version tag. ### Installation check diff --git a/README_ZH.md b/README_ZH.md index 2f8068c02..14d289a3b 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -158,15 +158,15 @@ pip install -v -e .[tools] # 安装部分工具库的依赖 依赖选项如下表所示: -| 标签 | 描述 | -|--------------|------------------------------| +| 标签 | 描述 | +|------------------|------------------------------| | `.` 或者 `.[mini]` | 安装支持 Data-Juicer 基础功能的最小依赖项 | -| `.[all]` | 安装所有可选依赖项(包括最小依赖项以及下面所有依赖项) | -| `.[sci]` | 安装所有算子的全量依赖 | -| `.[dist]` | 安装以分布式方式进行数据处理的依赖(实验性功能) | -| `.[dev]` | 安装作为贡献者开发 Data-Juicer 所需的依赖项 | -| `.[tools]` | 安装专用工具库(如质量分类器)所需的依赖项 | -| `.[sandbox]` | 安装沙河实验需要的依赖库,如用于视频评测的VBench | +| `.[all]` | 安装所有可选依赖项(包括最小依赖项以及下面所有依赖项) | +| `.[sci]` | 安装所有算子的全量依赖 | +| `.[sandbox]` | 安装沙盒实验室的基础依赖 | +| `.[dist]` | 安装以分布式方式进行数据处理的依赖(实验性功能) | +| `.[dev]` | 安装作为贡献者开发 Data-Juicer 所需的依赖项 | +| `.[tools]` | 安装专用工具库(如质量分类器)所需的依赖项 | ### 使用 pip 安装 @@ -193,6 +193,8 @@ pip install py-data-juicer ```shell docker build -t datajuicer/data-juicer:<version_tag> .
``` + + - `<version_tag>`的格式类似于`v0.2.0`,与发布(Release)的版本号相同。 ### 安装校验 diff --git a/configs/data_juicer_recipes/README.md b/configs/data_juicer_recipes/README.md index 1f6ad757c..57073fb2a 100644 --- a/configs/data_juicer_recipes/README.md +++ b/configs/data_juicer_recipes/README.md @@ -49,3 +49,11 @@ We use simple 3-σ rule to set the hyperparameters for ops in each recipe. |-------------------------------|-------| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | LLaVA-1.5-13B
(baseline) | **80.0** | 63.3 | 53.6 | 71.6 | **61.3** | 85.9 | 1531.3 | 67.7 | 63.6 | 61.6 | 72.5 | 36.1 | | LLaVA-1.5-13B
(refined pretrain dataset) | 79.94 | **63.5** | **54.09** | **74.20** | 60.82 | **86.67** | **1565.53** | **68.2** | **63.9** | **61.8** | **75.9** | **37.4** | + +## For Video Dataset + +We provide an example video dataset processing recipe, [general-video-refine-example.yaml](general-video-refine-example.yaml), to help users make better use of video-related OPs. Here we apply three types of OPs: +- Text-Only: to improve the dataset quality according to the video captions. +- Video-Only: to improve the dataset quality according to the video features. +- Text-Video: to improve the dataset quality according to the alignment between text and videos. +Users can start processing their video datasets based on this recipe. diff --git a/configs/data_juicer_recipes/README_ZH.md b/configs/data_juicer_recipes/README_ZH.md index d7dd848d7..855e433d6 100644 --- a/configs/data_juicer_recipes/README_ZH.md +++ b/configs/data_juicer_recipes/README_ZH.md @@ -49,3 +49,12 @@ |---------------------------------|-------| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | LLaVA-1.5-13B
(基线) | **80.0** | 63.3 | 53.6 | 71.6 | **61.3** | 85.9 | 1531.3 | 67.7 | 63.6 | 61.6 | 72.5 | 36.1 | | LLaVA-1.5-13B
(完善后的预训练数据集) | 79.94 | **63.5** | **54.09** | **74.20** | 60.82 | **86.67** | **1565.53** | **68.2** | **63.9** | **61.8** | **75.9** | **37.4** | + +## 视频数据集 + +我们为用户提供了一个视频数据集处理菜谱样例以协助更好地使用视频相关的算子: [general-video-refine-example.yaml](general-video-refine-example.yaml) 。这里我们应用了三种类型的算子: +- 仅文本:根据视频描述提高数据集质量 +- 仅视频:根据视频性质提高数据集质量 +- 文本-视频:根据文本和视频间的对齐提高数据集质量 +用户可以基于这个菜谱开始他们的视频数据集处理流程。 +- \ No newline at end of file diff --git a/configs/data_juicer_recipes/general-video-refine-example.yaml b/configs/data_juicer_recipes/general-video-refine-example.yaml new file mode 100644 index 000000000..892c22ecb --- /dev/null +++ b/configs/data_juicer_recipes/general-video-refine-example.yaml @@ -0,0 +1,65 @@ +# Process config example including: +# - all global arguments +# - all ops and their arguments + +# global parameters +project_name: 'all' # project name for distinguish your configs +dataset_path: '/path/to/a/video-text/dataset.jsonl' + # accepted format: 'weight1(optional) dataset1-path weight2(optional) dataset2-path' +export_path: '/path/to/store/refined/dataset.jsonl' +np: 48 # number of subprocess to process your dataset + # Note: currently, we support specify only ONE key for each op, for cases requiring multiple keys, users can specify the op multiple times. We will only use the first key of `text_keys` when you set multiple keys. +open_tracer: true # whether to open the tracer to trace the changes during process. It might take more time when opening tracer + +# for multimodal data processing +video_key: 'videos' # key name of field to store the list of sample video paths. +video_special_token: '<__dj__video>' # the special token that represents a video in the text. In default, it's "<__dj__video>". You can specify your own special token according to your input dataset. + +eoc_special_token: '<|__dj__eoc|>' # the special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset. + +# process schedule: a list of several process operators with their arguments +# hyperparameters are set according to the 3-sigma stats on MSR-VTT dataset +process: + - language_id_score_filter: # filter text in specific language with language scores larger than a specific max value + lang: en # keep text in what language + min_score: 0.26311219 # the min language scores to filter text + - perplexity_filter: # filter text with perplexity score out of specific range + lang: en # compute perplexity in what language + max_ppl: 7376.81378 # the max perplexity score to filter text + - video_aesthetics_filter: # filter samples according to the aesthetics score of frame images extracted from videos. + hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor + min_score: 0.31767486 # the min aesthetics score of filter range + max_score: 1.0 # the max aesthetics score of filter range + frame_sampling_method: 'uniform' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframe", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "uniform" with frame_num=3, considering that the number of keyframes can be large while their difference is usually small in terms of their aesthetics. + frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". 
If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. + reduce_mode: avg # reduce mode over all frames extracted from videos, must be one of ['avg','max', 'min']. + any_or_all: any # keep this sample when any/all images meet the filter condition + - video_frames_text_similarity_filter: # keep samples whose similarities between sampled video frame images and text are within a specific range. + hf_clip: openai/clip-vit-base-patch32 # clip model name on huggingface to compute the similarity between frame image and text. The model choice is language-related. For example, for Chinese datasets, ChineseCLIP might be a better choice. + min_score: 0.16571071 # the min similarity to keep samples. + max_score: 1.0 # the max similarity to keep samples. + frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "all_keyframes". + frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. + horizontal_flip: false # flip frame image horizontally (left to right). + vertical_flip: false # flip frame image vertically (top to bottom). + reduce_mode: avg # reduce mode when one text corresponds to multiple videos in a chunk, must be one of ['avg','max', 'min']. + any_or_all: any # keep this sample when any/all videos meet the filter condition + - video_motion_score_filter: # Keep samples with video motion scores within a specific range. + min_score: 0.25 # the minimum motion score to keep samples + max_score: 10000.0 # the maximum motion score to keep samples + sampling_fps: 2 # the sampling rate in frames per second used to compute optical flow + any_or_all: any # keep this sample when any/all videos meet the filter condition + - video_nsfw_filter: # filter samples according to the nsfw scores of videos in them + hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification + score_threshold: 0.34847191 # the nsfw score threshold for samples, range from 0 to 1 + frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "all_keyframes". + frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. + reduce_mode: avg # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min']. 
+ any_or_all: any # keep this sample when any/all images meet the filter condition + - video_watermark_filter: # filter samples according to the predicted watermark probabilities of videos in them + hf_watermark_model: amrul-hzz/watermark_detector # Huggingface model name for watermark classification + prob_threshold: 0.96510297 # the predicted watermark probability threshold for samples, range from 0 to 1 + frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes". + frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. + reduce_mode: avg # reduce mode for multiple sampled video frames to compute final predicted watermark probabilities of videos, must be one of ['avg','max', 'min']. + any_or_all: any # keep this sample when any/all images meet the filter condition diff --git a/configs/demo/sandbox/vbench_eval_config.yaml b/configs/demo/sandbox/vbench_eval_config.yaml index f760d210a..86e302bfd 100644 --- a/configs/demo/sandbox/vbench_eval_config.yaml +++ b/configs/demo/sandbox/vbench_eval_config.yaml @@ -16,12 +16,11 @@ eval_name: load_ckpt_from_local: false # The dimensions considered in this eval. -# All dimensions include: ['subject_consistency', 'background_consistency', -# 'temporal_flickering', 'dynamic_degree', 'aesthetic_quality', -# 'object_class', 'multiple_objects', 'human_action', 'color', -# 'spatial_relationship', 'scene', 'temporal_style', -# 'appearance_style', 'overall_consistency'] -# NOTE: the evaluation of motion_smoothness and imaging_quality has bug in the pipy vbench repository. +# All dimensions include: ['subject_consistency', 'background_consistency', 'temporal_flickering', +# 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality', 'object_class', +# 'multiple_objects', 'human_action', 'color', 'spatial_relationship', 'scene', 'temporal_style', +# 'appearance_style', 'overall_consistency'] +# NOTE: current version of vbench in pypi has bug in evaluation on dimension motion_smoothness and imaging_quality dimension_list: - subject_consistency - dynamic_degree diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index b9bdefaeb..330ac82c8 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -1,11 +1,13 @@ +import copy import os import shutil import time -from argparse import ArgumentError +from argparse import ArgumentError, Namespace from typing import Dict, List, Tuple, Union from jsonargparse import (ActionConfigFile, ArgumentParser, dict_to_namespace, namespace_to_dict) +from jsonargparse.typehints import ActionTypeHint from jsonargparse.typing import ClosedUnitInterval, NonNegativeInt, PositiveInt from loguru import logger @@ -370,8 +372,8 @@ def init_setup_from_cfg(cfg): 2. update cache directory 3. 
update checkpoint and `temp_dir` of tempfile - :param cfg: a original cfg - :param cfg: a updated cfg + :param cfg: an original cfg + :param cfg: an updated cfg """ cfg.export_path = os.path.abspath(cfg.export_path) @@ -552,16 +554,16 @@ def update_op_process(cfg, parser): # e.g. # `python demo.py --config demo.yaml # --language_id_score_filter.lang en` + temp_cfg = cfg for i, op_in_process in enumerate(cfg.process): op_in_process_name = list(op_in_process.keys())[0] - temp_cfg = cfg if op_in_process_name not in option_in_commands: # update op params to temp cfg if set if op_in_process[op_in_process_name]: temp_cfg = parser.merge_config( - dict_to_namespace(op_in_process), cfg) + dict_to_namespace(op_in_process), temp_cfg) else: # args in the command line override the ones in `cfg.process` @@ -584,9 +586,42 @@ def update_op_process(cfg, parser): None if internal_op_para is None else namespace_to_dict(internal_op_para) } + + # check the op params via type hint + temp_parser = copy.deepcopy(parser) + recognized_args = set([ + action.dest for action in parser._actions + if hasattr(action, 'dest') and isinstance(action, ActionTypeHint) + ]) + + temp_args = namespace_to_arg_list(temp_cfg, + includes=recognized_args, + excludes=['config']) + temp_args = ['--config', temp_cfg.config[0].absolute] + temp_args + temp_parser.parse_args(temp_args) return cfg +def namespace_to_arg_list(namespace, prefix='', includes=None, excludes=None): + arg_list = [] + + for key, value in vars(namespace).items(): + + if issubclass(type(value), Namespace): + nested_args = namespace_to_arg_list(value, f'{prefix}{key}.') + arg_list.extend(nested_args) + elif value is not None: + concat_key = f'{prefix}{key}' + if includes is not None and concat_key not in includes: + continue + if excludes is not None and concat_key in excludes: + continue + arg_list.append(f'--{concat_key}') + arg_list.append(f'{value}') + + return arg_list + + def config_backup(cfg): cfg_path = cfg.config[0].absolute work_dir = cfg.work_dir diff --git a/data_juicer/core/executor.py b/data_juicer/core/executor.py index 5ffa6eb90..9ec12a35e 100644 --- a/data_juicer/core/executor.py +++ b/data_juicer/core/executor.py @@ -209,6 +209,9 @@ def run(self, load_data_np=None): desc=op_name + '_compute_stats') if self.cfg.use_checkpoint: prev = dataset + if op.stats_export_path is not None: + self.exporter.export_compute_stats( + dataset, op.stats_export_path) tmp = dataset.filter(op.process, num_proc=self.cfg.np, desc=op_name + '_process') diff --git a/data_juicer/core/exporter.py b/data_juicer/core/exporter.py index a8c7c35f9..fe74aacc5 100644 --- a/data_juicer/core/exporter.py +++ b/data_juicer/core/exporter.py @@ -195,6 +195,18 @@ def export(self, dataset): self._export_impl(dataset, self.export_path, self.suffix, self.export_stats) + def export_compute_stats(self, dataset, export_path): + """ + Export method for saving compute status in filters + """ + keep_stats_in_res_ds = self.keep_stats_in_res_ds + self.keep_stats_in_res_ds = True + self._export_impl(dataset, + export_path, + self.suffix, + export_stats=False) + self.keep_stats_in_res_ds = keep_stats_in_res_ds + @staticmethod def to_jsonl(dataset, export_path, num_proc=1, **kwargs): """ diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py index a9ef0b908..d42d72f95 100644 --- a/data_juicer/core/ray_executor.py +++ b/data_juicer/core/ray_executor.py @@ -188,6 +188,9 @@ def process_batch_arrow(table: pa.Table) -> pa.Table: else: dataset = dataset.map(op.compute_stats, 
num_gpus=num_gpus) + if op.stats_export_path is not None: + dataset.write_json(op.stats_export_path, + force_ascii=False) dataset = dataset.filter(op.process) else: logger.error( diff --git a/data_juicer/core/sandbox/evaluators.py b/data_juicer/core/sandbox/evaluators.py index c1c289e3e..192676b3d 100644 --- a/data_juicer/core/sandbox/evaluators.py +++ b/data_juicer/core/sandbox/evaluators.py @@ -1,5 +1,7 @@ +import json import os import shutil + import torch from vbench import VBench @@ -76,6 +78,7 @@ def run(self, eval_type, eval_obj, **kwargs): raise NotImplementedError( 'To be refactored from gpt4v related operators/tools.') + class VBenchEvaluator(BaseEvaluator): def merge_results(self, result_path): @@ -98,25 +101,25 @@ def run(self, eval_type, eval_obj, **kwargs): dimension_list = self.eval_config.dimension_list local = self.eval_config.load_ckpt_from_local if cuda_device_count() > 0: - device = torch.device("cuda") + device = torch.device('cuda') else: - device = torch.device("cpu") + device = torch.device('cpu') my_vbench = VBench(device, prompt_path, result_dir) - my_vbench.evaluate( - videos_path = videos_path, - name = name, - dimension_list = dimension_list, - local = local - ) - result_dict = self.merge_results(os.path.join(result_dir, - name+'_eval_results.json')) - + my_vbench.evaluate(videos_path=videos_path, + name=name, + dimension_list=dimension_list, + local=local) + result_dict = self.merge_results( + os.path.join(result_dir, name + '_eval_results.json')) + + with open(os.path.join(result_dir, name + '_merged_results.json'), + 'w') as f: + json.dump(result_dict, f) + return float(result_dict['mean_score']) else: raise NotImplementedError( 'Unsupported evaluation type: {}'.format(eval_type)) - - class LmHarnessEvaluator(BaseEvaluator): diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index e5464619a..b5b2e79d9 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -153,6 +153,7 @@ def __init__(self, *args, **kwargs): from data_juicer.core.data import wrap_func_with_nested_access self.compute_stats = wrap_func_with_nested_access(self.compute_stats) + self.stats_export_path = kwargs.get('stats_export_path', None) def compute_stats(self, sample, context=False): """ diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index 2ca191c66..50ccc1014 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -13,14 +13,21 @@ OP_NAME = 'image_deduplicator' with AvailabilityChecking(['imagededup'], OP_NAME): - from imagededup.methods import AHash, DHash, PHash, WHash + import imagededup # noqa: F401 - HASH_METHOD = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } + HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} + + def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash + + mapping = { + 'phash': PHash, + 'dhash': DHash, + 'whash': WHash, + 'ahash': AHash + } + + return mapping[method_name] @OPERATORS.register_module(OP_NAME) @@ -40,10 +47,10 @@ def __init__(self, method: str = 'phash', *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) - if method not in HASH_METHOD.keys(): + if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. 
' - f'Can only be one of {HASH_METHOD.keys()}.') - self.hasher = HASH_METHOD[method]() + f'Can only be one of {HASH_METHOD}.') + self.hasher = get_hash_method(method)() def compute_hash(self, sample, context=False): # check if it's computed already diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index a95cd3baa..10530c48b 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -11,14 +11,21 @@ OP_NAME = 'ray_image_deduplicator' with AvailabilityChecking(['imagededup'], OP_NAME): - from imagededup.methods import AHash, DHash, PHash, WHash + import imagededup # noqa: F401 - HASH_METHOD = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } + HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} + + def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash + + mapping = { + 'phash': PHash, + 'dhash': DHash, + 'whash': WHash, + 'ahash': AHash + } + + return mapping[method_name] @OPERATORS.register_module(OP_NAME) @@ -46,10 +53,10 @@ def __init__(self, redis_port=redis_port, *args, **kwargs) - if method not in HASH_METHOD.keys(): + if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. ' - f'Can only be one of {HASH_METHOD.keys()}.') - self.hasher = HASH_METHOD[method]() + f'Can only be one of {HASH_METHOD}.') + self.hasher = get_hash_method(method)() def calculate_hash(self, sample, context=False): if self.image_key not in sample or not sample[self.image_key]: diff --git a/data_juicer/utils/logger_utils.py b/data_juicer/utils/logger_utils.py index 392dddac2..a91f610fe 100644 --- a/data_juicer/utils/logger_utils.py +++ b/data_juicer/utils/logger_utils.py @@ -18,6 +18,7 @@ import inspect import os import sys +from io import StringIO from loguru import logger from loguru._file_sink import FileSink @@ -52,12 +53,14 @@ def __init__(self, level='INFO', caller_names=('datasets', 'logging')): Default value: (apex, pycocotools). """ self.level = level - self.linebuf = '' self.caller_names = caller_names + self.buffer = StringIO() + self.BUFFER_SIZE = 1024 * 1024 def write(self, buf): full_name = get_caller_name(depth=1) module_name = full_name.rsplit('.', maxsplit=-1)[0] + self.buffer.write(buf) if module_name in self.caller_names: for line in buf.rstrip().splitlines(): # use caller level log @@ -66,8 +69,13 @@ def write(self, buf): # sys.__stdout__.write(buf) logger.opt(raw=True).info(buf) + self.buffer.truncate(self.BUFFER_SIZE) + + def getvalue(self): + return self.buffer.getvalue() + def flush(self): - pass + self.buffer.flush() def redirect_sys_output(log_level='INFO'): @@ -76,7 +84,7 @@ def redirect_sys_output(log_level='INFO'): :param log_level: log level string of loguru. Default value: "INFO". 
""" - redirect_logger = StreamToLoguru(log_level) + redirect_logger = StreamToLoguru(level=log_level) sys.stderr = redirect_logger sys.stdout = redirect_logger diff --git a/docs/Sandbox-ZH.md b/docs/Sandbox-ZH.md index accab8bd1..e26f53aef 100644 --- a/docs/Sandbox-ZH.md +++ b/docs/Sandbox-ZH.md @@ -4,6 +4,19 @@ 用户在沙盒中,除了Data-Juicer基础的数据优化与数据菜谱微调功能外,还可以便捷地使用数据洞察与分析、沙盒模型训练与评测、基于数据和模型反馈优化数据菜谱等可配置组件,共同组成完整的一站式数据-模型研发流水线。 ## 快速上手 +### 依赖准备 +在使用沙盒实验室前,你可能需要使用如下命令安装沙盒相关的第三方依赖: +```shell +pip install -v -e .[sandbox] + +# 或者直接安装全量依赖 +pip install -v -e .[all] +``` + +**注意**:一些沙盒的依赖还需要额外的领域依赖。例如,如果用户想要在沙盒中训练一个 ModelScope 平台的NLP模型,那可能需要为 `modelscope` 库 +安装额外的 `nlp` 领域依赖(参考其[安装文档](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85) )。 +因此如果使用沙盒过程中,这些第三方依赖抛出了一些"未找到模块(Module-Not-Found)"的报错时,用户需要先检查这些库的文档以寻求帮助。 + ### 准备沙盒配置文件 沙盒的主配置文件除了Data-Juicer的配置文件外,还包括了若干额外的参数用于指定沙盒流水线中可能会运行的模型训练、推理、评测等步骤的配置信息,完整的额外参数可参考 [config_all.yaml](https://github.com/modelscope/data-juicer/blob/main/configs/config_all.yaml) 中的“for sandbox or hpo”部分参数。一个sandbox的配置文件示例可参考`configs/demo/sandbox/sandbox.yaml`: ```yaml diff --git a/docs/Sandbox.md b/docs/Sandbox.md index 4004f3460..480819806 100644 --- a/docs/Sandbox.md +++ b/docs/Sandbox.md @@ -4,6 +4,19 @@ In Data-Juicer, the data sandbox laboratory provides users with the best practic In addition to the basic data optimization and recipe refinement features offered by Data-Juicer, users can seamlessly use configurable components such as data probe and analysis, model training and evaluation, and data and model feedback-based recipe refinement to form a complete one-stop data-model research and development pipeline. ## Quick Start +### Requirements +Before using sandbox, you might need to install sandbox-related third-party dependencies by running the command below: +```shell +pip install -v -e .[sandbox] + +# or install all dependencies +pip install -v -e .[all] +``` + +**NOTICE**: some sandbox-related dependencies require extra domain dependencies. For example, if users want to train an NLP model from ModelScope +in the sandbox, you might need to install extra `nlp` dependencies for `modelscope` library (see the [installation docs](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85)). +So if some Module-Not-Found errors are raised by these third-party libraries when running the sandbox, users need to check their docs first. + ### Prepare Configuration Files for Sandbox The configuration file of the sandbox includes several additional parameters in addition to the configuration of Data-Juicer. These parameters are used to specify the configuration information for model training, inference, evaluation, and other steps that may run in the sandbox pipeline. For the complete set of additional parameters, please refer to the "for sandbox or hpo" section in the [config_all.yaml](https://github.com/modelscope/data-juicer/blob/main/configs/config_all.yaml). 
An example of a sandbox configuration file can be found in `configs/demo/sandbox/sandbox.yaml`: ```yaml diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index b273872b0..c162fb21d 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -1,7 +1,7 @@ fsspec==2023.5.0 pyarrow<=12.0.0 pandas==2.0.3 -datasets==2.11.0 +datasets==2.18.0 av soundfile librosa diff --git a/environments/sandbox_requires.txt b/environments/sandbox_requires.txt index cf0048038..bced95149 100644 --- a/environments/sandbox_requires.txt +++ b/environments/sandbox_requires.txt @@ -1,6 +1,9 @@ torch -vbench wandb fire pyspark -detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6 \ No newline at end of file +# vbench-related +detectron2@git+https://github.com/facebookresearch/detectron2.git@b7c7f4ba82192ff06f2bbb162b9f67b00ea55867 +vbench +# modelscope-related +modelscope diff --git a/environments/science_requires.txt b/environments/science_requires.txt index 4faa330c8..e848ea5ba 100644 --- a/environments/science_requires.txt +++ b/environments/science_requires.txt @@ -1,3 +1,5 @@ +torch>=1.11.0 +torchaudio easyocr fasttext-wheel kenlm @@ -16,8 +18,6 @@ accelerate tiktoken opencc==1.1.6 imagededup -torch -torchaudio dlib spacy-pkuseg==0.0.32 diffusers diff --git a/setup.py b/setup.py index 000bfe901..18c78b368 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def get_install_requirements(require_f_paths, env_dir='environments'): get_install_requirements( ['preprocess_requires.txt', 'quality_classifier_requires.txt']), 'sandbox': - get_install_requirements(['sandbox_requires.txt']) + get_install_requirements(['sandbox_requires.txt']), } extra_requires['all'] = [v for v in extra_requires.values()] diff --git a/tests/config/demo_4_test_bad_val.yaml b/tests/config/demo_4_test_bad_val.yaml new file mode 100644 index 000000000..3f1b4dbd2 --- /dev/null +++ b/tests/config/demo_4_test_bad_val.yaml @@ -0,0 +1,19 @@ +# Process config example for Arxiv dataset + +# global parameters +project_name: 'test_demo' +dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file +np: 4 # number of subprocess to process your dataset + +export_path: './outputs/demo/demo-processed.parquet' + +# process schedule +# a list of several process operators with their arguments +process: + - whitespace_normalization_mapper: + - language_id_score_filter: + lang: 'zh' + min_score: 1.1 # !! a bad value !! 
+ - document_deduplicator: # deduplicate text samples using md5 hashing exact matching method + lowercase: false # whether to convert text to lower case + ignore_non_character: false diff --git a/tests/config/test_config_funcs.py b/tests/config/test_config_funcs.py index bd2d6e7d8..741d7d9a1 100644 --- a/tests/config/test_config_funcs.py +++ b/tests/config/test_config_funcs.py @@ -1,6 +1,6 @@ import os import unittest -from contextlib import redirect_stdout +from contextlib import redirect_stdout, redirect_stderr from io import StringIO from jsonargparse import Namespace @@ -12,6 +12,9 @@ test_yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'demo_4_test.yaml') +test_bad_yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + 'demo_4_test_bad_val.yaml') + class ConfigTest(DataJuicerTestCaseBase): @@ -61,6 +64,7 @@ def test_yaml_cfg_file(self): 'video_key': 'videos', 'accelerator': 'cpu', 'spec_numprocs': 0, + 'stats_export_path': None, 'cpu_required': 1, 'mem_required': 0, 'use_actor': False, @@ -70,6 +74,34 @@ def test_yaml_cfg_file(self): _, op_from_cfg = load_ops(cfg.process) self.assertTrue(len(op_from_cfg) == 3) + def test_val_range_check_cmd(self): + out = StringIO() + err_msg_head = ("language_id_score_filter.min_score") + err_msg = ("Not of type ClosedUnitInterval: 1.1 does not conform to " + "restriction v>=0 and v<=1") + with redirect_stdout(out), redirect_stderr(out): + with self.assertRaises(SystemExit) as cm: + init_configs( + args=f'--config {test_yaml_path} ' + '--language_id_score_filter.min_score 1.1'.split()) + self.assertEqual(cm.exception.code, 2) + out_str = out.getvalue() + self.assertIn(err_msg_head, out_str) + self.assertIn(err_msg, out_str) + + def test_val_range_check_yaml(self): + out = StringIO() + err_msg_head = ("language_id_score_filter.min_score") + err_msg = ("Not of type ClosedUnitInterval: 1.1 does not conform to " + "restriction v>=0 and v<=1") + with redirect_stdout(out), redirect_stderr(out): + with self.assertRaises(SystemExit) as cm: + init_configs(args=f'--config {test_bad_yaml_path}'.split()) + self.assertEqual(cm.exception.code, 2) + out_str = out.getvalue() + self.assertIn(err_msg_head, out_str) + self.assertIn(err_msg, out_str) + def test_mixture_cfg(self): out = StringIO() with redirect_stdout(out): @@ -99,6 +131,7 @@ def test_mixture_cfg(self): 'video_key': 'videos', 'accelerator': 'cpu', 'spec_numprocs': 0, + 'stats_export_path': None, 'cpu_required': 1, 'mem_required': 0, 'use_actor': False, @@ -115,6 +148,7 @@ def test_mixture_cfg(self): 'video_key': 'videos', 'accelerator': 'cpu', 'spec_numprocs': 0, + 'stats_export_path': None, 'cpu_required': 1, 'mem_required': 0, 'use_actor': False, @@ -131,6 +165,7 @@ def test_mixture_cfg(self): 'video_key': 'videos', 'accelerator': 'cpu', 'spec_numprocs': 0, + 'stats_export_path': None, 'cpu_required': 1, 'mem_required': 0, 'use_actor': False, @@ -147,6 +182,7 @@ def test_mixture_cfg(self): 'video_key': 'videos', 'accelerator': 'cpu', 'spec_numprocs': 0, + 'stats_export_path': None, 'cpu_required': 1, 'mem_required': 0, 'use_actor': False, @@ -163,6 +199,7 @@ def test_mixture_cfg(self): 'video_key': 'videos', 'accelerator': 'cpu', 'spec_numprocs': 0, + 'stats_export_path': None, 'cpu_required': 1, 'mem_required': 0, 'use_actor': False, diff --git a/tools/mm_eval/README.md b/tools/mm_eval/README.md new file mode 100644 index 000000000..cedacb7c0 --- /dev/null +++ b/tools/mm_eval/README.md @@ -0,0 +1,3 @@ +VBench from the paper "VBench: Comprehensive Benchmark Suite 
for Video Generative Models". + +Please refer to [GitHub](https://github.com/Vchitect/VBench) for more detail. diff --git a/tools/mm_eval/README_ZH.md b/tools/mm_eval/README_ZH.md new file mode 100644 index 000000000..85c7e75a7 --- /dev/null +++ b/tools/mm_eval/README_ZH.md @@ -0,0 +1,3 @@ +VBench来自paper:"VBench: Comprehensive Benchmark Suite for Video Generative Models"。 + +请跳转[GitHub](https://github.com/Vchitect/VBench)查看更多信息。 diff --git a/tools/mm_eval/vbench_metrics/VBench_full_info.json b/tools/mm_eval/vbench_metrics/VBench_full_info.json index a3a4f0968..e60c40eb0 100644 --- a/tools/mm_eval/vbench_metrics/VBench_full_info.json +++ b/tools/mm_eval/vbench_metrics/VBench_full_info.json @@ -9129,4 +9129,4 @@ } } } -] \ No newline at end of file +] diff --git a/tools/mm_eval/vbench_metrics/evaluate.py b/tools/mm_eval/vbench_metrics/evaluate.py index dbb35cca4..4e3c62552 100644 --- a/tools/mm_eval/vbench_metrics/evaluate.py +++ b/tools/mm_eval/vbench_metrics/evaluate.py @@ -3,91 +3,97 @@ Matches the original implementation at https://github.com/Vchitect/VBench/blob/master/evaluate.py""" -import torch -import os -from vbench import VBench -from datetime import datetime +# flake8: noqa: E501 + import argparse import json +import os +from datetime import datetime + +import torch +from vbench import VBench + def parse_args(): CUR_DIR = os.path.dirname(os.path.abspath(__file__)) - parser = argparse.ArgumentParser(description='VBench', formatter_class=argparse.RawTextHelpFormatter) + parser = argparse.ArgumentParser( + description='VBench', formatter_class=argparse.RawTextHelpFormatter) parser.add_argument( - "--output_path", + '--output_path', type=str, default='./evaluation_results/', - help="output path to save the evaluation results", + help='output path to save the evaluation results', ) parser.add_argument( - "--full_json_dir", + '--full_json_dir', type=str, default=f'{CUR_DIR}/VBench_full_info.json', - help="path to save the json file that contains the prompt and dimension information", + help= + 'path to save the json file that contains the prompt and dimension information', ) parser.add_argument( - "--videos_path", + '--videos_path', type=str, required=True, - help="folder that contains the sampled videos", + help='folder that contains the sampled videos', ) parser.add_argument( - "--dimension", + '--dimension', nargs='+', required=True, - help="list of evaluation dimensions, usage: --dimension ", + help= + 'list of evaluation dimensions, usage: --dimension ', ) parser.add_argument( - "--load_ckpt_from_local", + '--load_ckpt_from_local', type=bool, required=False, - help="whether load checkpoints from local default paths (assuming you have downloaded the checkpoints locally", + help= + 'whether load checkpoints from local default paths (assuming you have downloaded the checkpoints locally', ) parser.add_argument( - "--read_frame", + '--read_frame', type=bool, required=False, - help="whether directly read frames, or directly read videos", + help='whether directly read frames, or directly read videos', ) parser.add_argument( - "--mode", + '--mode', choices=['custom_input', 'vbench_standard', 'vbench_category'], default='vbench_standard', - help="""This flags determine the mode of evaluations, choose one of the following: + help= + """This flags determine the mode of evaluations, choose one of the following: 1. "custom_input": receive input prompt from either --prompt/--prompt_file flags or the filename 2. "vbench_standard": evaluate on standard prompt suite of VBench 3. 
"vbench_category": evaluate on specific category """, ) parser.add_argument( - "--custom_input", - action="store_true", + '--custom_input', + action='store_true', required=False, help="(deprecated) use --mode=\"custom_input\" instead", ) - parser.add_argument( - "--prompt", - type=str, - default="", - help="""Specify the input prompt + parser.add_argument('--prompt', + type=str, + default='', + help="""Specify the input prompt If not specified, filenames will be used as input prompts * Mutually exclusive to --prompt_file. ** This option must be used with --custom_input flag - """ - ) + """) parser.add_argument( - "--prompt_file", + '--prompt_file', type=str, required=False, help="""Specify the path of the file that contains prompt lists If not specified, filenames will be used as input prompts * Mutually exclusive to --prompt. ** This option must be used with --custom_input flag - """ - ) + """) parser.add_argument( - "--category", + '--category', type=str, required=False, help="""This is for mode=='vbench_category' @@ -97,14 +103,14 @@ def parse_args(): ## for dimension specific params ### parser.add_argument( - "--imaging_quality_preprocessing_mode", + '--imaging_quality_preprocessing_mode', type=str, required=False, default='longer', help="""This is for setting preprocessing in imaging_quality 1. 'shorter': if the shorter side is more than 512, the image is resized so that the shorter side is 512. 2. 'longer': if the longer side is more than 512, the image is resized so that the longer side is 512. - 3. 'shorter_centercrop': if the shorter side is more than 512, the image is resized so that the shorter side is 512. + 3. 'shorter_centercrop': if the shorter side is more than 512, the image is resized so that the shorter side is 512. Then the center 512 x 512 after resized is used for evaluation. 4. 'None': no preprocessing """, @@ -117,9 +123,9 @@ def main(): args = parse_args() print(f'args: {args}') - device = torch.device("cuda") + device = torch.device('cuda') my_VBench = VBench(device, args.full_json_dir, args.output_path) - + print(f'start evaluation') current_time = datetime.now().strftime('%Y-%m-%d-%H:%M:%S') @@ -128,37 +134,41 @@ def main(): prompt = [] - assert args.custom_input == False, "(Deprecated) use --mode=custom_input instead" - - if (args.prompt_file is not None) and (args.prompt != ""): - raise Exception("--prompt_file and --prompt cannot be used together") - if (args.prompt_file is not None or args.prompt != "") and (not args.mode=='custom_input'): - raise Exception("must set --mode=custom_input for using external prompt") + assert args.custom_input == False, '(Deprecated) use --mode=custom_input instead' + + if (args.prompt_file is not None) and (args.prompt != ''): + raise Exception('--prompt_file and --prompt cannot be used together') + if (args.prompt_file is not None + or args.prompt != '') and (not args.mode == 'custom_input'): + raise Exception( + 'must set --mode=custom_input for using external prompt') if args.prompt_file: with open(args.prompt_file, 'r') as f: prompt = json.load(f) - assert type(prompt) == dict, "Invalid prompt file format. The correct format is {\"video_path\": prompt, ... }" - elif args.prompt != "": + assert type( + prompt + ) == dict, "Invalid prompt file format. The correct format is {\"video_path\": prompt, ... 
}" + elif args.prompt != '': prompt = [args.prompt] - if args.category != "": + if args.category != '': kwargs['category'] = args.category - kwargs['imaging_quality_preprocessing_mode'] = args.imaging_quality_preprocessing_mode + kwargs[ + 'imaging_quality_preprocessing_mode'] = args.imaging_quality_preprocessing_mode my_VBench.evaluate( - videos_path = args.videos_path, - name = f'results_{current_time}', - prompt_list=prompt, # pass in [] to read prompt from filename - dimension_list = args.dimension, + videos_path=args.videos_path, + name=f'results_{current_time}', + prompt_list=prompt, # pass in [] to read prompt from filename + dimension_list=args.dimension, local=args.load_ckpt_from_local, read_frame=args.read_frame, mode=args.mode, - **kwargs - ) + **kwargs) print('done') -if __name__ == "__main__": - main() \ No newline at end of file +if __name__ == '__main__': + main() diff --git a/tools/sandbox_starter.py b/tools/sandbox_starter.py index 8f2fe3771..e1f8d91a4 100644 --- a/tools/sandbox_starter.py +++ b/tools/sandbox_starter.py @@ -1,8 +1,8 @@ import json import yaml -from loguru import logger from jsonargparse import dict_to_namespace +from loguru import logger from data_juicer.config import init_configs from data_juicer.core.sandbox.pipelines import SandBoxExecutor
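The executor, exporter, and `base_op` changes above add a per-op `stats_export_path` argument to filter ops: when it is set, the stats produced by `compute_stats` are exported before the filtering step. A minimal sketch of how this might be set in a recipe's `process` list follows; the op choice, parameter values, and path are illustrative assumptions, not taken from this diff:

```yaml
# hypothetical recipe fragment: dump the per-sample stats computed by a filter op
process:
  - language_id_score_filter:
      lang: en                                                 # keep samples whose text is identified as English
      min_score: 0.8                                           # minimum language identification confidence to keep a sample
      stats_export_path: './outputs/demo/lang_id_stats.jsonl'  # assumed path; intermediate stats are written here before filtering
```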