From e0bd57329a4089331e4c5bb7d345b40e7df386d5 Mon Sep 17 00:00:00 2001 From: "lielin.hyl" Date: Fri, 20 Dec 2024 11:36:47 +0800 Subject: [PATCH] + add messages <--> dj conversion tools --- .../post_tuning_dialog/dj_to_messages.py | 67 ++++++++++--------- .../llama_factory_sharegpt_to_dj.py | 15 +++-- .../post_tuning_dialog/messages_to_dj.py | 67 +++++++++---------- 3 files changed, 76 insertions(+), 73 deletions(-) diff --git a/tools/fmt_conversion/post_tuning_dialog/dj_to_messages.py b/tools/fmt_conversion/post_tuning_dialog/dj_to_messages.py index ea4bf40ec..cf9e2325f 100644 --- a/tools/fmt_conversion/post_tuning_dialog/dj_to_messages.py +++ b/tools/fmt_conversion/post_tuning_dialog/dj_to_messages.py @@ -1,7 +1,27 @@ -# This tool is used to convert dataset in ModelScope-Swift Messages format to a -# target dataset in Data-Juicer query-response format. +# This tool is used to convert dataset in Data-Juicer format to a +# target dataset in ModelScope-Swift Messages-like format. # -# ModelScope-Swift Messages format: +# Data-Juicer format (query-response format): +# [ +# { +# "images": ["coco/train2017/000000033471.jpg"], +# "query": "Is the bus driving down the street or pulled off to the side?", +# "response": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 +# "history": [ +# [ +# "\nWhat are the colors of the bus in the image?", +# "The bus in the image is white and red." +# ], +# [ +# "What feature can be seen on the back of the bus?", +# "The back of the bus features an advertisement." +# ], +# ] +# }, +# ... +# ] +# +# Corresponding ModelScope-Swift Messages format: # - usually in json format # [ # { @@ -36,36 +56,14 @@ # ... 
# ] # -# Corresponding Data-Juicer format (query-response format): -# [ -# { -# "images": ["coco/train2017/000000033471.jpg"], -# "query": "Is the bus driving down the street or pulled off to the side?", -# "response": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 -# "history": [ -# [ -# "\nWhat are the colors of the bus in the image?", -# "The bus in the image is white and red." -# ], -# [ -# "What feature can be seen on the back of the bus?", -# "The back of the bus features an advertisement." -# ], -# ] -# }, -# ... -# ] -# # Reference: # https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md # # This format is nearly the same as the LLaMA-Factory ShareGPT format, so we # reuse the code in that conversion tools. -from typing import List, Union - +import dj_to_llama_factory_sharegpt import fire -import llama_factory_sharegpt_to_dj from loguru import logger @@ -76,31 +74,34 @@ def main( messages_key: str = 'messages', role_key: str = 'role', content_key: str = 'content', + human_role: str = 'user', + assistant_role: str = 'assistant', system_role: str = 'system', instruction_role: str = 'instruction', - multimodal_keys: Union[str, List[str]] = None, ): """ - Convert a Messages-like dataset to the Data-Juicer query-response format. + Convert a Data-Juicer query-response dataset to the Messages-like format. :param src_ds_path: the path to the source ShareGPT-like dataset. :param tgt_ds_path: the path to store the converted target dataset. :param messages_key: the field key to store messages. :param role_key: the field key to store the sentence from. :param content_key: the field key to store the sentence content. - :param system_role: the field key to store the system prompt. - :param instruction_role: the field key to store the instruction content. - :param multimodal_keys: optional keys to store multimodal data. + :param human_role: the role to store the human prompt. 
+ :param assistant_role: the role to store the assistant response. + :param system_role: the role to store the system prompt. + :param instruction_role: the role to store the instruction content. """ - llama_factory_sharegpt_to_dj.main( + dj_to_llama_factory_sharegpt.main( src_ds_path, tgt_ds_path, conversations_key=messages_key, from_key=role_key, value_key=content_key, + human_role=human_role, + assistant_role=assistant_role, system_role=system_role, instruction_role=instruction_role, - multimodal_keys=multimodal_keys, ) diff --git a/tools/fmt_conversion/post_tuning_dialog/llama_factory_sharegpt_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/llama_factory_sharegpt_to_dj.py index 8f427b553..1814dff0d 100644 --- a/tools/fmt_conversion/post_tuning_dialog/llama_factory_sharegpt_to_dj.py +++ b/tools/fmt_conversion/post_tuning_dialog/llama_factory_sharegpt_to_dj.py @@ -78,7 +78,9 @@ def sharegpt_to_dj( instruction_role: str = 'instruction', multimodal_keys: Union[str, List[str]] = None, ): - modified_keys = {conversations_key}.union(set(multimodal_keys)) + modified_keys = {conversations_key} + if multimodal_keys: + modified_keys = modified_keys.union(set(multimodal_keys)) new_sample = { key: sample[key] for key in sample if key not in modified_keys @@ -140,11 +142,12 @@ def sharegpt_to_dj( }) # update multimodal data - for mm_key in multimodal_keys: - if not isinstance(sample[mm_key], list): - new_sample[mm_key] = [sample[mm_key]] - else: - new_sample[mm_key] = sample[mm_key] + if multimodal_keys: + for mm_key in multimodal_keys: + if not isinstance(sample[mm_key], list): + new_sample[mm_key] = [sample[mm_key]] + else: + new_sample[mm_key] = sample[mm_key] return new_sample diff --git a/tools/fmt_conversion/post_tuning_dialog/messages_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/messages_to_dj.py index cf9e2325f..ea4bf40ec 100644 --- a/tools/fmt_conversion/post_tuning_dialog/messages_to_dj.py +++ b/tools/fmt_conversion/post_tuning_dialog/messages_to_dj.py @@ 
-1,27 +1,7 @@ -# This tool is used to convert dataset in Data-Juicer format to a -# target dataset in ModelScope-Swift Messages-like format. +# This tool is used to convert dataset in ModelScope-Swift Messages format to a +# target dataset in Data-Juicer query-response format. # -# Data-Juicer format (query-response format): -# [ -# { -# "images": ["coco/train2017/000000033471.jpg"], -# "query": "Is the bus driving down the street or pulled off to the side?", -# "response": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 -# "history": [ -# [ -# "\nWhat are the colors of the bus in the image?", -# "The bus in the image is white and red." -# ], -# [ -# "What feature can be seen on the back of the bus?", -# "The back of the bus features an advertisement." -# ], -# ] -# }, -# ... -# ] -# -# Corresponding ModelScope-Swift Messages format: +# ModelScope-Swift Messages format: # - usually in json format # [ # { @@ -56,14 +36,36 @@ # ... # ] # +# Corresponding Data-Juicer format (query-response format): +# [ +# { +# "images": ["coco/train2017/000000033471.jpg"], +# "query": "Is the bus driving down the street or pulled off to the side?", +# "response": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 +# "history": [ +# [ +# "\nWhat are the colors of the bus in the image?", +# "The bus in the image is white and red." +# ], +# [ +# "What feature can be seen on the back of the bus?", +# "The back of the bus features an advertisement." +# ], +# ] +# }, +# ... +# ] +# # Reference: # https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md # # This format is nearly the same as the LLaMA-Factory ShareGPT format, so we # reuse the code in that conversion tools. 
-import dj_to_llama_factory_sharegpt +from typing import List, Union + import fire +import llama_factory_sharegpt_to_dj from loguru import logger @@ -74,34 +76,31 @@ def main( messages_key: str = 'messages', role_key: str = 'role', content_key: str = 'content', - human_role: str = 'user', - assistant_role: str = 'assistant', system_role: str = 'system', instruction_role: str = 'instruction', + multimodal_keys: Union[str, List[str]] = None, ): """ - Convert a ShareGPT-like dataset to the Data-Juicer query-response format. + Convert a Messages-like dataset to the Data-Juicer query-response format. :param src_ds_path: the path to the source ShareGPT-like dataset. :param tgt_ds_path: the path to store the converted target dataset. :param messages_key: the field key to store messages. :param role_key: the field key to store the sentence from. :param content_key: the field key to store the sentence content. - :param human_role: the role to store the human prompt. - :param assistant_role: the role to store the instruction content. - :param system_role: the role to store the system prompt. - :param instruction_role: the role to store the instruction content. + :param system_role: the role to store the system prompt. + :param instruction_role: the role to store the instruction content. + :param multimodal_keys: optional keys to store multimodal data. """ - dj_to_llama_factory_sharegpt.main( + llama_factory_sharegpt_to_dj.main( src_ds_path, tgt_ds_path, conversations_key=messages_key, from_key=role_key, value_key=content_key, - human_role=human_role, - assistant_role=assistant_role, system_role=system_role, instruction_role=instruction_role, + multimodal_keys=multimodal_keys, )