From 8ac1f9af45c0a62758be75625471e708075b27e1 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Mon, 11 Mar 2024 10:31:02 +0900 Subject: [PATCH 1/3] Update words in DJ_SORA.md: `open source` to `open-source` (#237) `open source` to `open-source` --- docs/DJ_SORA.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/DJ_SORA.md b/docs/DJ_SORA.md index 1dce43860..4cf15f81e 100644 --- a/docs/DJ_SORA.md +++ b/docs/DJ_SORA.md @@ -2,7 +2,7 @@ English | [中文页面](DJ_SORA_ZH.md) --- -Data is the key to the unprecedented development of large multi-modal models such as SORA. How to obtain and process data efficiently and scientifically faces new challenges! DJ-SORA aims to create a series of large-scale, high-quality open source multi-modal data sets to assist the open source community in data understanding and model training. +Data is the key to the unprecedented development of large multi-modal models such as SORA. How to obtain and process data efficiently and scientifically faces new challenges! DJ-SORA aims to create a series of large-scale, high-quality open-source multi-modal data sets to assist the open-source community in data understanding and model training. DJ-SORA is based on Data-Juicer (including hundreds of dedicated video, image, audio, text and other multi-modal data processing [operators](Operators_ZH.md) and tools) to form a series of systematic and reusable Multimodal "data recipes" for analyzing, cleaning, and generating large-scale, high-quality multimodal data. 
From 9a3db9ca16eb2d2f25f2eb7faf2e4bfb27c583fe Mon Sep 17 00:00:00 2001 From: Daoyuan Chen <67475544+yxdyc@users.noreply.github.com> Date: Mon, 11 Mar 2024 09:40:20 +0800 Subject: [PATCH 2/3] Update DJ_SORA.md (#234) fix the toc links --- docs/DJ_SORA.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/DJ_SORA.md b/docs/DJ_SORA.md index 4cf15f81e..25052360b 100644 --- a/docs/DJ_SORA.md +++ b/docs/DJ_SORA.md @@ -18,12 +18,12 @@ This project is being actively updated and maintained. We eagerly invite you to # Roadmap ## Overview -* [Support high-performance loading and processing of video data](#Support high-performance loading and processing of video data) -* [Basic Operators (video spatio-temporal dimension)](#Basic operator video spatio-temporal dimension) -* [Advanced Operators (fine-grained modal matching and data generation)](#Advanced operators fine-grained modal matching and data generation) -* [Advanced Operators (Video Content)](#Advanced Operator Video Content) -* [DJ-SORA Data Recipes and Datasets](#DJ-SORA Data Recipes and Datasets) -* [DJ-SORA Data Validation and Model Training](#DJ-SORA Data Validation and Model Training) +* [Support high-performance loading and processing of video data](#support-high-performance-loading-and-processing-of-video-data) +* [Basic Operators (video spatio-temporal dimension)](#basic-operators-video-spatio-temporal-dimension) +* [Advanced Operators (fine-grained modal matching and data generation)](#advanced-operators-fine-grained-modal-matching-and-data-generation) +* [Advanced Operators (Video Content)](#advanced-operators-video-content) +* [DJ-SORA Data Recipes and Datasets](#dj-sora-data-recipes-and-datasets) +* [DJ-SORA Data Validation and Model Training](#dj-sora-data-validation-and-model-training) ## Support high-performance loading and processing of video data From da6440a55188300261d5e3f2d8074d496ea747d6 Mon Sep 17 00:00:00 2001 From: garyzhang99 
<46197280+garyzhang99@users.noreply.github.com> Date: Mon, 11 Mar 2024 10:03:10 +0800 Subject: [PATCH 3/3] fix scripts (#235) --- scripts/dlc/partition_data_dlc.py | 2 +- scripts/dlc/run_on_dlc.sh | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/dlc/partition_data_dlc.py b/scripts/dlc/partition_data_dlc.py index b0f5bbbfc..2585c14d4 100644 --- a/scripts/dlc/partition_data_dlc.py +++ b/scripts/dlc/partition_data_dlc.py @@ -28,7 +28,7 @@ def partition_data(json_file_path: str, hostnames: List[str]): nodes_video_size[min_node] += video_sizes[video] for hostname in hostnames: - host_file_path = f"{json_file_path.rsplit('.', 1)[0]}_{hostname}.json" + host_file_path = f"{json_file_path.rsplit('.', 1)[0]}_{hostname}.jsonl" with open(host_file_path, 'w') as f: for entry in nodes_data[hostname]: f.write(json.dumps(entry) + '\n') diff --git a/scripts/dlc/run_on_dlc.sh b/scripts/dlc/run_on_dlc.sh index 8ed356e99..d2c257235 100644 --- a/scripts/dlc/run_on_dlc.sh +++ b/scripts/dlc/run_on_dlc.sh @@ -36,5 +36,13 @@ else sed -i$SED_I_SUFFIX "s|\(dataset_path: '\)\(.*\)'\(.*\)|\1\2_$hostname'\3|" "$new_config_file" fi +if grep -q "export_path: .*\.json" "$new_config_file"; then + # .json export_path + sed -i$SED_I_SUFFIX "s|\(export_path: \)\(.*\)\(/[^/]*\)\(.json\)|\1\2\3_$hostname\4|" "$new_config_file" +else + # dir export_path + sed -i$SED_I_SUFFIX "s|\(export_path: '\)\(.*\)'\(.*\)|\1\2_$hostname'\3|" "$new_config_file" +fi + # run to process data python tools/process_data.py --config "$new_config_file"