From 155f9f615faee319bf3eda54bff9a9b2be38bb39 Mon Sep 17 00:00:00 2001 From: chenhesen Date: Fri, 24 Nov 2023 17:24:09 +0800 Subject: [PATCH] support audio & audio-text data reading (#95) * fix opencc serialization error * support audio-text data reading * update multimodal_README * fix pre-commit error * modify audio_special_token * support only one target_field * fix pre-commit * add id for log --- configs/config_all.yaml | 2 + data_juicer/utils/mm_utils.py | 15 +- tools/multimodal/README.md | 37 +++ tools/multimodal/README_ZH.md | 35 +++ .../dj_to_llava.py | 4 +- .../dj_to_wavcaps.py | 166 +++++++++++++ .../wavcaps_to_dj.py | 226 ++++++++++++++++++ 7 files changed, 481 insertions(+), 4 deletions(-) create mode 100644 tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py create mode 100644 tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py diff --git a/configs/config_all.yaml b/configs/config_all.yaml index 58970a08b..95d9623fc 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -26,6 +26,8 @@ cache_compress: null # The compression me # for multimodal data processing image_key: 'images' # Key name of field to store the list of sample image paths. image_special_token: '<__dj__image>' # The special token that represents an image in the text. In default, it's "<__dj__image>". You can specify your own special token according to your input dataset. +audio_key: 'audios' # Key name of field to store the list of sample audio paths. +audio_special_token: '<__dj__audio>' # The special token that represents an audio in the text. In default, it's "<__dj__audio>". You can specify your own special token according to your input dataset. eoc_special_token: '<|__dj__eoc|>' # The special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset. 
diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py index ea6b2063f..817f298bd 100644 --- a/data_juicer/utils/mm_utils.py +++ b/data_juicer/utils/mm_utils.py @@ -1,4 +1,4 @@ -from datasets import Image +from datasets import Audio, Image from data_juicer.utils.constant import DEFAULT_PREFIX @@ -8,6 +8,7 @@ class SpecialTokens(object): # modality image = f'<{DEFAULT_PREFIX}image>' + audio = f'<{DEFAULT_PREFIX}audio>' # others eoc = f'<|{DEFAULT_PREFIX}eoc|>' @@ -17,13 +18,23 @@ def load_images(paths): return [load_image(path) for path in paths] +def load_audios(paths): + return [load_audio(path) for path in paths] + + def load_image(path): img_feature = Image() img = img_feature.decode_example(img_feature.encode_example(path)) return img -def get_image_size(path): +def load_audio(path, sampling_rate=None): + aud_feature = Audio(sampling_rate) + aud = aud_feature.decode_example(aud_feature.encode_example(path)) + return (aud['array'], aud['sampling_rate']) + + +def get_image_size(path, ): import os return os.path.getsize(path) diff --git a/tools/multimodal/README.md b/tools/multimodal/README.md index b9175c27c..b950ea08b 100644 --- a/tools/multimodal/README.md +++ b/tools/multimodal/README.md @@ -18,6 +18,7 @@ For now, dataset formats that are supported by Data-Juicer are listed in the fol | Format | source_format_to_data_juicer_format | data_juicer_format_to_target_format | Ref. 
| |------------|-------------------------------------|-------------------------------------|------------------------------------------------------------------------------------------------------------------| | LLaVA-like | `llava_to_dj.py` | `dj_to_llava.py` | [Format Description](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) | +| WavCaps-like | `wavcaps_to_dj.py` | `dj_to_wavcaps.py` | [Format Description](https://github.com/XinhaoMei/WavCaps#table-of-contents) | For all tools, you can run the following command to find out the usage of them: @@ -91,3 +92,39 @@ and converted datasets, so we can regard this sample is aligned with the origina } ] ``` + +### WavCaps-like + +[WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) is composed of four sub-datasets: [FreeSound](https://freesound.org/), [BBC Sound Effects](https://sound-effects.bbcrewind.co.uk/), [SoundBible](https://soundbible.com/), and [AudioSet Strongly-labelled Subset](https://research.google.com/audioset/download_strong.html). Each sub-dataset has different fields. For example, the 'description' field is included in SoundBible, but does not exist in AudioSet. To ensure that the different sub-datasets can be properly merged after conversion, the union of all fields from the sub-datasets is used during the `wavcaps_to_dj` stage, and all fields are fully retained during the `dj_to_wavcaps` stage. 
+ +```json +# original dataset +{ "num_captions_per_audio": 1, + "data": [{ + "title": "Airplane Landing Airport", + "description": "Large commercial airplane landing at an airport runway.", + "author": "Daniel Simion", + "href": "2219-Airplane-Landing-Airport.html", + "caption": "An airplane is landing.", + "id": "2219", + "duration": 14.1424375, + "audio": "wav_path", + "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}] +} + +# converted dataset +{ "num_captions_per_audio": 1, + "data": [{ + "title": "Airplane Landing Airport", + "description": "Large commercial airplane landing at an airport runway.", + "author": "Daniel Simion", + "href": "2219-Airplane-Landing-Airport.html", + "caption": "An airplane is landing.", + "id": "2219", + "duration": 14.1424375, + "audio": "wav_path", + "download_link": "http://soundbible.com/grab.php?id=2219&type=wav", + "category": "", + "tags": "" }] +} +``` diff --git a/tools/multimodal/README_ZH.md b/tools/multimodal/README_ZH.md index 9eb7757ce..be63af955 100644 --- a/tools/multimodal/README_ZH.md +++ b/tools/multimodal/README_ZH.md @@ -15,6 +15,7 @@ | 格式 | source_format_to_data_juicer_format | data_juicer_format_to_target_format | 格式参考 | |-----------|-------------------------------------|-------------------------------------|----------------------------------------------------------------------------------------------------| | 类LLaVA格式 | `llava_to_dj.py` | `dj_to_llava.py` | [格式描述](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) | +| 类WavCaps格式 | `wavcaps_to_dj.py` | `dj_to_wavcaps.py` | [格式描述](https://github.com/XinhaoMei/WavCaps#table-of-contents) | 对于所有工具,您可以运行以下命令来了解它们的详细用法: @@ -74,3 +75,37 @@ python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --hel } ] ``` + +#### 类WavCaps格式 +[WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) 数据集由 [FreeSound](https://freesound.org/),[BBC Sound 
Effects](https://sound-effects.bbcrewind.co.uk/),[SoundBible](https://soundbible.com/),[AudioSet Strongly-labelled Subset](https://research.google.com/audioset/download_strong.html) 四个子数据集组成,每个数据集里都有不同的字段。例如SoundBible里包含了‘description’字段,而该字段在AudioSet里并不存在。为了保证不同子数据集在转换后能够正常合并,在wavcaps_to_dj阶段使用了所有子数据集字段的并集,并在dj_to_wavcaps阶段完整保留了所有字段。 +```json +# 原始数据集 +{ "num_captions_per_audio": 1, + "data": [{ + "title": "Airplane Landing Airport", + "description": "Large commercial airplane landing at an airport runway.", + "author": "Daniel Simion", + "href": "2219-Airplane-Landing-Airport.html", + "caption": "An airplane is landing.", + "id": "2219", + "duration": 14.1424375, + "audio": "wav_path", + "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}] +} + +# 转换后数据集 +{ "num_captions_per_audio": 1, + "data": [{ + "title": "Airplane Landing Airport", + "description": "Large commercial airplane landing at an airport runway.", + "author": "Daniel Simion", + "href": "2219-Airplane-Landing-Airport.html", + "caption": "An airplane is landing.", + "id": "2219", + "duration": 14.1424375, + "audio": "wav_path", + "download_link": "http://soundbible.com/grab.php?id=2219&type=wav", + "category": "", + "tags": "" }] +} +``` diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py index b0c1495df..c58a06604 100644 --- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py +++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py @@ -1,5 +1,5 @@ -# This tool is used to convert multimodal dataset in LLaVA format to a target -# dataset in Data-Juicer format. +# This tool is used to convert multimodal dataset in Data-Juicer format to a +# target dataset in LLaVA format. 
# # Corresponding Data-Juicer format: # - multi-chunk interleaved image-text sequence diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py new file mode 100644 index 000000000..b7cf268e1 --- /dev/null +++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py @@ -0,0 +1,166 @@ +# This tool is used to convert multimodal dataset in Data-Juicer format to a +# target dataset in WavCaps format. +# +# Data-Juicer format: +# {'id': 2219, +# 'audios': ['./path/to/audio/2219.flac'], +# 'text': '<__dj__audio>\n' +# 'An airplane is landing. <|__dj__eoc|>', +# '__dj__meta__': { +# 'num_captions_per_audio': 1, +# 'title': 'Airplane Landing Airport', +# 'description': 'Large commercial airplane landing at an airport runway.', # noqa: E501 +# 'author': 'Daniel Simion', +# 'href': '2219-Airplane-Landing-Airport.html', +# 'caption': 'An airplane is landing.', +# 'id': '2219', +# 'duration': 14.1424375, +# 'audio': 'wav_path', +# 'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav', +# 'category': '', +# 'tags': '' }} +# {'id': 2218, +# 'audios': ['./path/to/audio/2218.flac'], +# 'text': '<__dj__audio>\n' +# 'Someone is ringing a bell. 
#          <|__dj__eoc|>',
#  '__dj__meta__': {
#    'num_captions_per_audio': 1,
#    'title': 'Service Bell Help',
#    'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
#    'author': 'Daniel Simion',
#    'href': '2218-Service-Bell-Help.html',
#    'caption': 'Someone is ringing a bell.',
#    'id': '2218',
#    'duration': 1.5698125,
#    'audio': 'wav_path',
#    'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav',
#    'category': '',
#    'tags': '' }}
#
# Corresponding WavCaps format:
# { 'num_captions_per_audio': 1,
#   'data': [{
#     'title': 'Airplane Landing Airport',
#     'description': 'Large commercial airplane landing at an airport runway.',  # noqa: E501
#     'author': 'Daniel Simion',
#     'href': '2219-Airplane-Landing-Airport.html',
#     'caption': 'An airplane is landing.',
#     'id': '2219',
#     'duration': 14.1424375,
#     'audio': 'wav_path',
#     'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
#   }, {
#     'title': 'Service Bell Help',
#     'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
#     'author': 'Daniel Simion',
#     'href': '2218-Service-Bell-Help.html',
#     'caption': 'Someone is ringing a bell.',
#     'id': '2218',
#     'duration': 1.5698125,
#     'audio': 'wav_path',
#     'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav'
#   },
#   ...]
# }

import json
import os

import fire
import jsonlines as jl
from loguru import logger
from tqdm import tqdm

from data_juicer.utils.constant import Fields
from data_juicer.utils.mm_utils import SpecialTokens


@logger.catch
def main(
    dj_ds_path: str,
    target_wavcaps_ds_path: str,
    target_field: str = 'caption',
    eoc_special_token: str = SpecialTokens.eoc,
    audio_special_token: str = SpecialTokens.audio,
    remove_eoc_at_last: bool = True,
    remove_target_field_token: bool = False,
    sent_seperator: str = '\n',
):
    """
    Convert a Data-Juicer-format dataset to a WavCaps-like dataset.

    For every input sample, the dict stored under ``Fields.meta`` becomes one
    entry of the output ``data`` list, and its ``target_field`` value is
    overwritten with the (cleaned) sample text. Samples without
    ``Fields.meta``, without ``target_field`` in their meta, or without a
    ``text`` field are skipped with a warning.

    :param dj_ds_path: path to the input dataset in Data-Juicer format.
    :param target_wavcaps_ds_path: path to store the converted dataset in
        WavCaps format. Must end with ".json".
    :param target_field: the field used to describe audio in the WavCaps-like
        dataset, which can be one of ['caption','title','description'].
    :param eoc_special_token: the special token for "end of a chunk". It's used
        to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
        Data-Juicer).
    :param audio_special_token: the special token for audios. It's used to
        locate the audios in the text.
    :param remove_eoc_at_last: whether to remove eoc_special_token from the
        text. Every occurrence is removed. Default: True.
    :param remove_target_field_token: whether to remove the extra
        target_field_token ("[[<target_field>]]: ") from the text.
    :param sent_seperator: seperator to split different sentences.
        Default: \\n.
    :raises FileNotFoundError: if ``dj_ds_path`` does not exist.
    :raises ValueError: if the target path does not end with ".json" or
        ``target_field`` is not one of the supported fields.
    """
    # ----- Constant settings. Better not to change them. -----
    from_format = '[[%s]]: '  # default handle method for the text label
    # ----- Constant settings. Better not to change them. -----

    # Validate every argument first so that a bad call has no side effects
    # on the file system.
    if not os.path.exists(dj_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{dj_ds_path}] can not be found.')
    if not target_wavcaps_ds_path.endswith('.json'):
        raise ValueError(
            'Only support "json" target dataset file for WavCaps now.')
    if target_field not in ['caption', 'description', 'title']:
        raise ValueError(
            "target_field must be in '['caption', 'description', 'title']'")

    target_dir = os.path.dirname(target_wavcaps_ds_path)
    if target_dir and not os.path.exists(target_dir):
        logger.info(f'Create directory [{target_dir}] '
                    f'for the target dataset.')
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(target_dir, exist_ok=True)

    logger.info('Start to convert.')
    samples = {'num_captions_per_audio': 1, 'data': []}
    with jl.open(dj_ds_path, 'r') as reader:
        for sample in tqdm(reader):
            # The id is only used for log messages, so a missing id must not
            # abort the whole conversion.
            sample_id = sample.get('id')
            if Fields.meta not in sample:
                logger.warning(
                    f'{Fields.meta} does not exist in this sample with '
                    f'id [{sample_id}].')
                continue
            meta = sample[Fields.meta]

            if target_field not in meta:
                logger.warning(
                    f'{target_field} does not exist in this sample with '
                    f'id [{sample_id}].')
                continue

            text = sample.get('text')
            if text is None:
                logger.warning(
                    f'text does not exist in this sample with '
                    f'id [{sample_id}].')
                continue

            # num_captions_per_audio is a dataset-level field in WavCaps, so
            # it is moved out of the per-sample meta. The value of the last
            # sample that carries it wins; the default of 1 is kept otherwise.
            if 'num_captions_per_audio' in meta:
                samples['num_captions_per_audio'] = meta.pop(
                    'num_captions_per_audio')

            text = text.replace(audio_special_token + sent_seperator, '')
            if remove_eoc_at_last:
                text = text.replace(eoc_special_token, '')
            if remove_target_field_token:
                text = text.replace(from_format % target_field, '')
            meta[target_field] = text
            samples['data'].append(meta)

    logger.info(f'Start to write the converted dataset to '
                f'[{target_wavcaps_ds_path}]...')
    # Use a context manager so the file is flushed and closed deterministically.
    with open(target_wavcaps_ds_path, 'w', encoding='utf-8') as fout:
        json.dump(samples, fout)


if __name__ == '__main__':
    fire.Fire(main)
a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py new file mode 100644 index 000000000..7cb9470a2 --- /dev/null +++ b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py @@ -0,0 +1,226 @@ +# This tool is used to convert multimodal dataset in WavCaps format to a target +# dataset in Data-Juicer format. +# +# WavCaps format: +# { 'num_captions_per_audio': 1, +# 'data': [{ +# 'title': 'Airplane Landing Airport', +# 'description': 'Large commercial airplane landing at an airport runway.', # noqa: E501 +# 'author': 'Daniel Simion', +# 'href': '2219-Airplane-Landing-Airport.html', +# 'caption': 'An airplane is landing.', +# 'id': '2219', +# 'duration': 14.1424375, +# 'audio': 'wav_path', +# 'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav' +# }, { +# 'title': 'Service Bell Help', +# 'description': 'Customer ringing service bell in need of help in a store.', # noqa: E501 +# 'author': 'Daniel Simion', +# 'href': '2218-Service-Bell-Help.html', +# 'caption': 'Someone is ringing a bell.', +# 'id': '2218', +# 'duration': 1.5698125, +# 'audio': 'wav_path', +# 'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav' +# }, +# ...] +# } +# +# Corresponding Data-Juicer format: +# {'id': 2219, +# 'audios': ['./path/to/audio/2219.flac'], +# 'text': '<__dj__audio>\n' +# 'An airplane is landing. 
#          <|__dj__eoc|>',
#  '__dj__meta__': {
#    'num_captions_per_audio': 1,
#    'title': 'Airplane Landing Airport',
#    'description': 'Large commercial airplane landing at an airport runway.',  # noqa: E501
#    'author': 'Daniel Simion',
#    'href': '2219-Airplane-Landing-Airport.html',
#    'caption': 'An airplane is landing.',
#    'id': '2219',
#    'duration': 14.1424375,
#    'audio': 'wav_path',
#    'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav',
#    'category': '',
#    'tags': '' }}
# {'id': 2218,
#  'audios': ['./path/to/audio/2218.flac'],
#  'text': '<__dj__audio>\n'
#          'Someone is ringing a bell. <|__dj__eoc|>',
#  '__dj__meta__': {
#    'num_captions_per_audio': 1,
#    'title': 'Service Bell Help',
#    'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
#    'author': 'Daniel Simion',
#    'href': '2218-Service-Bell-Help.html',
#    'caption': 'Someone is ringing a bell.',
#    'id': '2218',
#    'duration': 1.5698125,
#    'audio': 'wav_path',
#    'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav',
#    'category': '',
#    'tags': '' }}

import json
import os

import fire
import jsonlines as jl
from loguru import logger
from tqdm import tqdm

from data_juicer.utils.constant import Fields
from data_juicer.utils.mm_utils import SpecialTokens


def creat_meta_filed(num_captions_per_audio, source_meta):
    """
    Build the meta dict of one converted sample.

    The result starts from the union of the fields used by the WavCaps
    sub-datasets, each defaulting to an empty string, so that samples from
    different sub-datasets share the same keys. Values from ``source_meta``
    then override the defaults.

    :param num_captions_per_audio: dataset-level value copied into the meta.
    :param source_meta: the original WavCaps sample (a dict).
    :return: a new dict; ``source_meta`` itself is not modified.
    """
    meta_dict = {
        'num_captions_per_audio': num_captions_per_audio,
        'title': '',
        'description': '',
        'author': '',
        'href': '',
        'caption': '',
        'id': '',
        'duration': '',
        'audio': '',
        'download_link': '',
        'category': '',
        'tags': '',
    }
    meta_dict.update(source_meta)
    return meta_dict


def get_all_files(dirname):
    """
    Recursively map every file name under ``dirname`` to its path.

    :param dirname: root directory to walk.
    :return: dict of ``{file name: path joined from the walked directory}``.
        If the same file name occurs in several sub-directories, the one
        visited last by ``os.walk`` overwrites the earlier ones.
    """
    result = {}
    for maindir, _, file_name_list in os.walk(dirname):
        for filename in file_name_list:
            result[filename] = os.path.join(maindir, filename)
    return result


@logger.catch
def main(
    wavcaps_json_path: str,
    wavcaps_audio_path: str,
    target_ds_path: str,
    str_id: bool = True,
    target_field: str = 'caption',
    eoc_special_token: str = SpecialTokens.eoc,
    audio_special_token: str = SpecialTokens.audio,
    add_eoc_at_last: bool = True,
    add_target_field_token: bool = False,
    sent_seperator: str = '\n',
):
    """
    Convert a WavCaps-like dataset to the Data-Juicer format.

    Samples without an id, without a matching "<id>.flac" file under
    ``wavcaps_audio_path``, or without ``target_field`` are skipped with a
    warning.

    :param wavcaps_json_path: path to the json files of WavCaps-like dataset.
    :param wavcaps_audio_path: path to the audio files of WavCaps-like dataset.
    :param target_ds_path: path to store the converted dataset in Data-Juicer
        format. Must end with ".jsonl".
    :param str_id: whether to convert the sample id to a string before
        storing it in the converted sample. Default: True.
    :param target_field: the field used to describe audio in the WavCaps-like
        dataset, which can be one of ['caption','title','description'].
    :param eoc_special_token: the special token for "end of a chunk". It's used
        to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
        Data-Juicer).
    :param audio_special_token: the special token for audios. It's used to
        locate the audios in the text.
    :param add_eoc_at_last: whether to add an extra eoc_special_token at the
        end of text. Default: True.
    :param add_target_field_token: whether to add an extra target_field_token
        ("[[<target_field>]]: ") into text.
    :param sent_seperator: seperator to split different sentences.
        Default: \\n.
    :raises FileNotFoundError: if the json or audio path does not exist.
    :raises ValueError: if the target path does not end with ".jsonl" or
        ``target_field`` is not one of the supported fields.
    """
    # ----- Constant settings. Better not to change them. -----
    text_key = 'text'  # default key of field to store the sample text
    audio_key = 'audios'  # default key of field to store the audio list
    from_format = '[[%s]]: '  # default handle method for the text label
    # ----- Constant settings. Better not to change them. -----

    # check arguments
    # check paths
    if not os.path.exists(wavcaps_json_path):
        raise FileNotFoundError(
            f'Input WavCaps json path [{wavcaps_json_path}] can '
            f'not be found.')
    if not os.path.exists(wavcaps_audio_path):
        raise FileNotFoundError(
            f'Input WavCaps audio path [{wavcaps_audio_path}] can '
            f'not be found.')
    if not target_ds_path.endswith('.jsonl'):
        raise ValueError('Only support "jsonl" target dataset file now.')

    if target_field not in ['caption', 'description', 'title']:
        raise ValueError(
            "target_field must be in '['caption', 'description', 'title']'")

    target_dir = os.path.dirname(target_ds_path)
    if target_dir and not os.path.exists(target_dir):
        logger.info(f'Create directory [{target_dir}] '
                    f'for the target dataset.')
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(target_dir, exist_ok=True)

    # check whether to add the eoc special token at last
    if not add_eoc_at_last:
        logger.warning('You choose not to add special eoc token at the last, '
                       'which might cause some compatibility problems for '
                       'other type of datasets (e.g. OpenFlamingo).')

    # load WavCaps dataset
    logger.info('Loading original WavCaps dataset.')
    # Use a context manager so the input file handle is closed right away.
    with open(wavcaps_json_path, 'r', encoding='utf-8') as fin:
        wavcaps_ds = json.load(fin)
    num_captions_per_audio = wavcaps_ds['num_captions_per_audio']
    wavcaps_ds = wavcaps_ds['data']
    logger.info(f'Load [{len(wavcaps_ds)}] samples.')
    all_audio_files = get_all_files(wavcaps_audio_path)

    with jl.open(target_ds_path, 'w') as writer:
        for sample in tqdm(wavcaps_ds):
            # id
            sample_id = sample.get('id')
            if sample_id is None:
                logger.warning('Found a sample without id, which can not be '
                               'matched to an audio file. Skip it.')
                continue
            if str_id:
                sample_id = str(sample_id)

            # Always derive the file name from the string form of the id, so
            # a non-string id still works when str_id is False.
            audio_name = str(sample_id).strip().split('.')[0] + '.flac'

            # audio and text
            if audio_name not in all_audio_files:
                logger.warning(f'No audios in the sample with id '
                               f'[{sample_id}], '
                               f'which means this sample is not a multimodal '
                               f'sample. You\'d better remove this sample '
                               f'before converting.')
                continue
            audio = [all_audio_files[audio_name]]
            text = audio_special_token + sent_seperator
            if target_field not in sample:
                logger.warning(
                    f'{target_field} does not exist in this sample with '
                    f'id [{sample_id}].')
                continue

            if add_target_field_token:
                text += from_format % target_field
            text += sample[target_field]
            if add_eoc_at_last:
                text += eoc_special_token

            # get the new sample with Data-Juicer format; the meta is built
            # only for samples that are actually written
            new_sample = {
                'id': sample_id,
                text_key: text,
                audio_key: audio,
                Fields.meta: creat_meta_filed(num_captions_per_audio, sample),
            }
            writer.write(new_sample)
    logger.info(f'Store the target dataset into [{target_ds_path}].')


if __name__ == '__main__':
    fire.Fire(main)