* add a recipe example for video processing
* add more details in docs for the Docker image
* add docs about this example recipe
Showing 6 changed files with 89 additions and 0 deletions.
configs/data_juicer_recipes/general-video-refine-example.yaml (65 additions, 0 deletions)
# Process config example including:
#   - all global arguments
#   - all ops and their arguments

# global parameters
project_name: 'all'  # project name to distinguish your configs
dataset_path: '/path/to/a/video-text/dataset.jsonl'
  # accepted format: 'weight1(optional) dataset1-path weight2(optional) dataset2-path'
export_path: '/path/to/store/refined/dataset.jsonl'
np: 48  # number of subprocesses to process your dataset
# Note: currently, we support specifying only ONE key for each op. For cases requiring multiple keys, specify the op multiple times; only the first key of `text_keys` is used when multiple keys are set.
open_tracer: true  # whether to open the tracer to trace changes during processing; processing might take more time when the tracer is open

# for multimodal data processing
video_key: 'videos'  # key name of the field that stores the list of sample video paths
video_special_token: '<__dj__video>'  # the special token that represents a video in the text; it defaults to "<__dj__video>". You can specify your own special token according to your input dataset.

eoc_special_token: '<|__dj__eoc|>'  # the special token that represents the end of a chunk in the text; it defaults to "<|__dj__eoc|>". You can specify your own special token according to your input dataset.
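
# For reference, one line of such a video-text dataset might look like the
# illustrative sketch below (hypothetical paths; it assumes Data-Juicer's usual
# multimodal convention of pairing each video in the list with a special token
# inside the text):
#   {"videos": ["path/to/video-1.mp4"],
#    "text": "<__dj__video> a caption describing the first video <|__dj__eoc|>"}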

# process schedule: a list of several process operators with their arguments
# hyperparameters are set according to the 3-sigma stats on the MSR-VTT dataset (see the note below)
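# An assumed reading of the 3-sigma rule above (following the usual convention):
# each bound is presumably derived from the distribution of the corresponding
# per-sample stat on MSR-VTT, roughly as
#   min_score ≈ mean - 3 * std    and/or    max_score ≈ mean + 3 * std
# so that samples lying more than three standard deviations from the mean are
# filtered out.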
process:
  - language_id_score_filter:  # keep text in the specified language with a language score larger than a specific min value
      lang: en  # keep text in which language
      min_score: 0.26311219  # the min language score to keep text
  - perplexity_filter:  # filter text with a perplexity score out of a specific range
      lang: en  # in which language to compute perplexity
      max_ppl: 7376.81378  # the max perplexity score to keep text
  - video_aesthetics_filter:  # filter samples according to the aesthetics scores of frame images extracted from videos
      hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE  # HuggingFace model name for the aesthetics predictor
      min_score: 0.31767486  # the min aesthetics score of the filter range
      max_score: 1.0  # the max aesthetics score of the filter range
      frame_sampling_method: 'uniform'  # sampling method for extracting frame images from the videos; should be one of ["all_keyframes", "uniform"]. The former extracts all keyframes and the latter extracts a specified number of frames uniformly from the video. Default: "uniform" with frame_num=3, considering that the number of keyframes can be large while their differences are usually small in terms of aesthetics.
      frame_num: 3  # the number of frames to be extracted uniformly from the video; only works when frame_sampling_method is "uniform". If it's 1, only the middle frame is extracted. If it's 2, only the first and last frames are extracted. If it's larger than 2, the remaining frames besides the first and last are extracted uniformly within the video duration.
      reduce_mode: avg  # reduce mode over all frames extracted from videos; must be one of ['avg', 'max', 'min']
      any_or_all: any  # keep this sample when any/all frame images meet the filter condition
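      # e.g. (illustrative): with reduce_mode 'avg' and frame_num 3, a video's
      # aesthetics score is the mean of its three sampled frames' scores, and
      # with any_or_all 'any' the sample is kept if any of its videos falls
      # within [min_score, max_score]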
  - video_frames_text_similarity_filter:  # keep samples whose similarities between sampled video frame images and text are within a specific range
      hf_clip: openai/clip-vit-base-patch32  # CLIP model name on HuggingFace to compute the similarity between frame images and text; note that the choice is language-related. For example, for Chinese datasets, ChineseCLIP might be a better choice.
      min_score: 0.16571071  # the min similarity to keep samples
      max_score: 1.0  # the max similarity to keep samples
      frame_sampling_method: all_keyframes  # sampling method for extracting frame images from the videos; should be one of ["all_keyframes", "uniform"]. The former extracts all keyframes and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3  # the number of frames to be extracted uniformly from the video; only works when frame_sampling_method is "uniform". If it's 1, only the middle frame is extracted. If it's 2, only the first and last frames are extracted. If it's larger than 2, the remaining frames besides the first and last are extracted uniformly within the video duration.
      horizontal_flip: false  # flip frame images horizontally (left to right)
      vertical_flip: false  # flip frame images vertically (top to bottom)
      reduce_mode: avg  # reduce mode when one text corresponds to multiple videos in a chunk; must be one of ['avg', 'max', 'min']
      any_or_all: any  # keep this sample when any/all videos meet the filter condition
  - video_motion_score_filter:  # keep samples with video motion scores within a specific range
      min_score: 0.25  # the minimum motion score to keep samples
      max_score: 10000.0  # the maximum motion score to keep samples
      sampling_fps: 2  # the sampling rate in frames per second for computing optical flow
      any_or_all: any  # keep this sample when any/all videos meet the filter condition
  - video_nsfw_filter:  # filter samples according to the NSFW scores of the videos in them
      hf_nsfw_model: Falconsai/nsfw_image_detection  # HuggingFace model name for NSFW classification
      score_threshold: 0.34847191  # the NSFW score threshold for samples, ranging from 0 to 1
      frame_sampling_method: all_keyframes  # sampling method for extracting frame images from the videos; should be one of ["all_keyframes", "uniform"]. The former extracts all keyframes and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3  # the number of frames to be extracted uniformly from the video; only works when frame_sampling_method is "uniform". If it's 1, only the middle frame is extracted. If it's 2, only the first and last frames are extracted. If it's larger than 2, the remaining frames besides the first and last are extracted uniformly within the video duration.
      reduce_mode: avg  # reduce mode over multiple sampled video frames to compute the NSFW scores of videos; must be one of ['avg', 'max', 'min']
      any_or_all: any  # keep this sample when any/all videos meet the filter condition
  - video_watermark_filter:  # filter samples according to the predicted watermark probabilities of the videos in them
      hf_watermark_model: amrul-hzz/watermark_detector  # HuggingFace model name for watermark classification
      prob_threshold: 0.96510297  # the predicted watermark probability threshold for samples, ranging from 0 to 1
      frame_sampling_method: all_keyframes  # sampling method for extracting frame images from the videos; should be one of ["all_keyframes", "uniform"]. The former extracts all keyframes and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3  # the number of frames to be extracted uniformly from the video; only works when frame_sampling_method is "uniform". If it's 1, only the middle frame is extracted. If it's 2, only the first and last frames are extracted. If it's larger than 2, the remaining frames besides the first and last are extracted uniformly within the video duration.
      reduce_mode: avg  # reduce mode over multiple sampled video frames to compute the final predicted watermark probabilities of videos; must be one of ['avg', 'max', 'min']
      any_or_all: any  # keep this sample when any/all videos meet the filter condition
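
To try this recipe (a minimal sketch assuming a standard Data-Juicer checkout; adapt the paths to your environment), point the processing entry script at this config:

python tools/process_data.py --config configs/data_juicer_recipes/general-video-refine-example.yaml

Replace the dataset_path and export_path placeholders in the YAML with real paths before running.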