Commit

+ add messages <--> dj conversion tools
HYLcool committed Dec 20, 2024
1 parent cb9f56f commit 5d7cd04
Showing 3 changed files with 218 additions and 1 deletion.
108 changes: 108 additions & 0 deletions tools/fmt_conversion/post_tuning_dialog/messages_to_dj.py
@@ -0,0 +1,108 @@
# This tool is used to convert a dataset in ModelScope-Swift Messages format
# to a target dataset in Data-Juicer query-response format.
#
# ModelScope-Swift Messages format:
#   - usually in json format
# [
#   {
#     "images": ["coco/train2017/000000033471.jpg"],
#     "messages": [
#       {
#         "role": "human",
#         "content": "<image>\nWhat are the colors of the bus in the image?"
#       },
#       {
#         "role": "gpt",
#         "content": "The bus in the image is white and red."
#       },
#       {
#         "role": "human",
#         "content": "What feature can be seen on the back of the bus?"
#       },
#       {
#         "role": "gpt",
#         "content": "The back of the bus features an advertisement."
#       },
#       {
#         "role": "human",
#         "content": "Is the bus driving down the street or pulled off to the side?" # noqa: E501
#       },
#       {
#         "role": "gpt",
#         "content": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501
#       }
#     ]
#   },
#   ...
# ]
#
# Corresponding Data-Juicer format (query-response format):
# [
#   {
#     "images": ["coco/train2017/000000033471.jpg"],
#     "query": "Is the bus driving down the street or pulled off to the side?",
#     "response": "The bus is driving down the street, which is crowded with people and other vehicles.", # noqa: E501
#     "history": [
#       [
#         "<image>\nWhat are the colors of the bus in the image?",
#         "The bus in the image is white and red."
#       ],
#       [
#         "What feature can be seen on the back of the bus?",
#         "The back of the bus features an advertisement."
#       ],
#     ]
#   },
#   ...
# ]
#
# Reference:
# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md
#
# This format is nearly the same as the LLaMA-Factory ShareGPT format, so we
# reuse the code of that conversion tool.

from typing import List, Optional, Union

import fire
import llama_factory_sharegpt_to_dj
from loguru import logger


@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    messages_key: str = 'messages',
    role_key: str = 'role',
    content_key: str = 'content',
    system_role: str = 'system',
    instruction_role: str = 'instruction',
    multimodal_keys: Optional[Union[str, List[str]]] = None,
):
    """
    Convert a Messages-like dataset to the Data-Juicer query-response format.
    :param src_ds_path: the path to the source Messages-like dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
    :param messages_key: the field key to store the list of messages.
    :param role_key: the field key to store the role of each message.
    :param content_key: the field key to store the message content.
    :param system_role: the role value used for the system prompt.
    :param instruction_role: the role value used for the instruction content.
    :param multimodal_keys: optional field keys to store multimodal data.
    """
    llama_factory_sharegpt_to_dj.main(
        src_ds_path,
        tgt_ds_path,
        conversations_key=messages_key,
        from_key=role_key,
        value_key=content_key,
        system_role=system_role,
        instruction_role=instruction_role,
        multimodal_keys=multimodal_keys,
    )


if __name__ == '__main__':
    fire.Fire(main)
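
For readers skimming the diff, a minimal standalone sketch of the mapping described in this file's comment block: the last human/assistant round becomes query/response, earlier rounds become history. It is illustrative only; the function name and the hard-coded 'messages'/'content' keys are assumptions of this sketch, while the tool itself delegates the real conversion to llama_factory_sharegpt_to_dj.main.

# Illustrative sketch only -- not part of the commit. It folds one sample in
# the Messages layout shown above into the Data-Juicer query-response layout,
# assuming strictly alternating human/assistant turns and no system message.
from typing import Any, Dict, List


def messages_sample_to_dj(sample: Dict[str, Any]) -> Dict[str, Any]:
    msgs: List[Dict[str, str]] = sample['messages']
    # Pair consecutive turns: (human, assistant), (human, assistant), ...
    rounds = [[msgs[i]['content'], msgs[i + 1]['content']]
              for i in range(0, len(msgs) - 1, 2)]
    query, response = rounds[-1]  # final round -> query / response
    converted = {
        'query': query,
        'response': response,
        'history': rounds[:-1],  # earlier rounds -> history
    }
    if 'images' in sample:  # carry multimodal fields over unchanged
        converted['images'] = sample['images']
    return converted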
@@ -190,7 +190,7 @@ def main(
        multimodal_keys = [multimodal_keys]

    # load ShareGPT dataset
-    logger.info('Loading original ShareGPT dataset.')
+    logger.info('Loading original dataset.')
    src_ds = json.load(open(src_ds_path, 'r', encoding='utf-8'))
    logger.info(f'Load [{len(src_ds)}] samples.')

109 changes: 109 additions & 0 deletions tools/fmt_conversion/post_tuning_dialog/dj_to_messages.py
@@ -0,0 +1,109 @@
# This tool is used to convert a dataset in Data-Juicer query-response format
# to a target dataset in ModelScope-Swift Messages-like format.
#
# Data-Juicer format (query-response format):
# [
#   {
#     "images": ["coco/train2017/000000033471.jpg"],
#     "query": "Is the bus driving down the street or pulled off to the side?",
#     "response": "The bus is driving down the street, which is crowded with people and other vehicles.", # noqa: E501
#     "history": [
#       [
#         "<image>\nWhat are the colors of the bus in the image?",
#         "The bus in the image is white and red."
#       ],
#       [
#         "What feature can be seen on the back of the bus?",
#         "The back of the bus features an advertisement."
#       ],
#     ]
#   },
#   ...
# ]
#
# Corresponding ModelScope-Swift Messages format:
#   - usually in json format
# [
#   {
#     "images": ["coco/train2017/000000033471.jpg"],
#     "messages": [
#       {
#         "role": "human",
#         "content": "<image>\nWhat are the colors of the bus in the image?"
#       },
#       {
#         "role": "gpt",
#         "content": "The bus in the image is white and red."
#       },
#       {
#         "role": "human",
#         "content": "What feature can be seen on the back of the bus?"
#       },
#       {
#         "role": "gpt",
#         "content": "The back of the bus features an advertisement."
#       },
#       {
#         "role": "human",
#         "content": "Is the bus driving down the street or pulled off to the side?" # noqa: E501
#       },
#       {
#         "role": "gpt",
#         "content": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501
#       }
#     ]
#   },
#   ...
# ]
#
# Reference:
# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md
#
# This format is nearly the same as the LLaMA-Factory ShareGPT format, so we
# reuse the code of that conversion tool.

import dj_to_llama_factory_sharegpt
import fire
from loguru import logger


@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    messages_key: str = 'messages',
    role_key: str = 'role',
    content_key: str = 'content',
    human_role: str = 'user',
    assistant_role: str = 'assistant',
    system_role: str = 'system',
    instruction_role: str = 'instruction',
):
    """
    Convert a Data-Juicer dataset to the ModelScope-Swift Messages format.
    :param src_ds_path: the path to the source Data-Juicer dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
    :param messages_key: the field key to store the list of messages.
    :param role_key: the field key to store the role of each message.
    :param content_key: the field key to store the message content.
    :param human_role: the role value used for human queries.
    :param assistant_role: the role value used for assistant responses.
    :param system_role: the role value used for the system prompt.
    :param instruction_role: the role value used for the instruction content.
    """
    dj_to_llama_factory_sharegpt.main(
        src_ds_path,
        tgt_ds_path,
        conversations_key=messages_key,
        from_key=role_key,
        value_key=content_key,
        human_role=human_role,
        assistant_role=assistant_role,
        system_role=system_role,
        instruction_role=instruction_role,
    )


if __name__ == '__main__':
    fire.Fire(main)
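
Conversely, a minimal sketch of the reverse mapping described in this file's comment block: history rounds are expanded back into alternating messages, followed by the final query/response pair. The function name is hypothetical and the role strings simply follow the tool's defaults ('user'/'assistant'); the real conversion is handled by dj_to_llama_factory_sharegpt.main.

# Illustrative sketch only -- not part of the commit. It expands one
# Data-Juicer query-response sample into the Messages layout shown above.
from typing import Any, Dict


def dj_sample_to_messages(sample: Dict[str, Any],
                          human_role: str = 'user',
                          assistant_role: str = 'assistant') -> Dict[str, Any]:
    messages = []
    # Earlier rounds come from the history field, in order.
    for human_turn, assistant_turn in sample.get('history', []):
        messages.append({'role': human_role, 'content': human_turn})
        messages.append({'role': assistant_role, 'content': assistant_turn})
    # The final round comes from query / response.
    messages.append({'role': human_role, 'content': sample['query']})
    messages.append({'role': assistant_role, 'content': sample['response']})
    converted = {'messages': messages}
    if 'images' in sample:  # carry multimodal fields over unchanged
        converted['images'] = sample['images']
    return converted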
