Skip to content

Commit

Permalink
+ add messages <--> dj conversion tools
Browse files Browse the repository at this point in the history
  • Loading branch information
HYLcool committed Dec 20, 2024
1 parent 5d7cd04 commit e0bd573
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 73 deletions.
67 changes: 34 additions & 33 deletions tools/fmt_conversion/post_tuning_dialog/dj_to_messages.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,27 @@
# This tool is used to convert dataset in ModelScope-Swift Messages format to a
# target dataset in Data-Juicer query-response format.
# This tool is used to convert a dataset in Data-Juicer format to a
# target dataset in ModelScope-Swift Messages-like format.
#
# ModelScope-Swift Messages format:
# Data-Juicer format (query-response format):
# [
# {
# "images": ["coco/train2017/000000033471.jpg"],
# "query": "Is the bus driving down the street or pulled off to the side?",
# "response": "The bus is driving down the street, which is crowded with people and other vehicles.", # noqa: E501
# "history": [
# [
# "<image>\nWhat are the colors of the bus in the image?",
# "The bus in the image is white and red."
# ],
# [
# "What feature can be seen on the back of the bus?",
# "The back of the bus features an advertisement."
# ],
# ]
# },
# ...
# ]
#
# Corresponding ModelScope-Swift Messages format:
# - usually in json format
# [
# {
Expand Down Expand Up @@ -36,36 +56,14 @@
# ...
# ]
#
# Corresponding Data-Juicer format (query-response format):
# [
# {
# "images": ["coco/train2017/000000033471.jpg"],
# "query": "Is the bus driving down the street or pulled off to the side?",
# "response": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501
# "history": [
# [
# "<image>\nWhat are the colors of the bus in the image?",
# "The bus in the image is white and red."
# ],
# [
# "What feature can be seen on the back of the bus?",
# "The back of the bus features an advertisement."
# ],
# ]
# },
# ...
# ]
#
# Reference:
# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md
#
# This format is nearly the same as the LLaMA-Factory ShareGPT format, so we
# reuse the code in those conversion tools.

from typing import List, Union

import dj_to_llama_factory_sharegpt
import fire
import llama_factory_sharegpt_to_dj
from loguru import logger


Expand All @@ -76,31 +74,34 @@ def main(
messages_key: str = 'messages',
role_key: str = 'role',
content_key: str = 'content',
human_role: str = 'user',
assistant_role: str = 'assistant',
system_role: str = 'system',
instruction_role: str = 'instruction',
multimodal_keys: Union[str, List[str]] = None,
):
"""
Convert a Messages-like dataset to the Data-Juicer query-response format.
Convert a Data-Juicer query-response dataset to the Messages-like format.
:param src_ds_path: the path to the source Data-Juicer dataset.
:param tgt_ds_path: the path to store the converted target dataset.
:param messages_key: the field key to store messages.
:param role_key: the field key to store the sentence from.
:param content_key: the field key to store the sentence content.
:param system_role: the field key to store the system prompt.
:param instruction_role: the field key to store the instruction content.
:param multimodal_keys: optional keys to store multimodal data.
:param human_role: the role to store the human prompt.
:param assistant_role: the role to store the assistant response.
:param system_role: the role to store the system prompt.
:param instruction_role: the role to store the instruction content.
"""
llama_factory_sharegpt_to_dj.main(
dj_to_llama_factory_sharegpt.main(
src_ds_path,
tgt_ds_path,
conversations_key=messages_key,
from_key=role_key,
value_key=content_key,
human_role=human_role,
assistant_role=assistant_role,
system_role=system_role,
instruction_role=instruction_role,
multimodal_keys=multimodal_keys,
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ def sharegpt_to_dj(
instruction_role: str = 'instruction',
multimodal_keys: Union[str, List[str]] = None,
):
modified_keys = {conversations_key}.union(set(multimodal_keys))
modified_keys = {conversations_key}
if multimodal_keys:
modified_keys = modified_keys.union(set(multimodal_keys))
new_sample = {
key: sample[key]
for key in sample if key not in modified_keys
Expand Down Expand Up @@ -140,11 +142,12 @@ def sharegpt_to_dj(
})

# update multimodal data
for mm_key in multimodal_keys:
if not isinstance(sample[mm_key], list):
new_sample[mm_key] = [sample[mm_key]]
else:
new_sample[mm_key] = sample[mm_key]
if multimodal_keys:
for mm_key in multimodal_keys:
if not isinstance(sample[mm_key], list):
new_sample[mm_key] = [sample[mm_key]]
else:
new_sample[mm_key] = sample[mm_key]

return new_sample

Expand Down
67 changes: 33 additions & 34 deletions tools/fmt_conversion/post_tuning_dialog/messages_to_dj.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,7 @@
# This tool is used to convert dataset in Data-Juicer format to a
# target dataset in ModelScope-Swift Messages-like format.
# This tool is used to convert a dataset in ModelScope-Swift Messages format
# to a target dataset in Data-Juicer query-response format.
#
# Data-Juicer format (query-response format):
# [
# {
# "images": ["coco/train2017/000000033471.jpg"],
# "query": "Is the bus driving down the street or pulled off to the side?",
# "response": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501
# "history": [
# [
# "<image>\nWhat are the colors of the bus in the image?",
# "The bus in the image is white and red."
# ],
# [
# "What feature can be seen on the back of the bus?",
# "The back of the bus features an advertisement."
# ],
# ]
# },
# ...
# ]
#
# Corresponding ModelScope-Swift Messages format:
# ModelScope-Swift Messages format:
# - usually in json format
# [
# {
Expand Down Expand Up @@ -56,14 +36,36 @@
# ...
# ]
#
# Corresponding Data-Juicer format (query-response format):
# [
# {
# "images": ["coco/train2017/000000033471.jpg"],
# "query": "Is the bus driving down the street or pulled off to the side?",
# "response": "The bus is driving down the street, which is crowded with people and other vehicles.", # noqa: E501
# "history": [
# [
# "<image>\nWhat are the colors of the bus in the image?",
# "The bus in the image is white and red."
# ],
# [
# "What feature can be seen on the back of the bus?",
# "The back of the bus features an advertisement."
# ],
# ]
# },
# ...
# ]
#
# Reference:
# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md
#
# This format is nearly the same as the LLaMA-Factory ShareGPT format, so we
# reuse the code in those conversion tools.

import dj_to_llama_factory_sharegpt
from typing import List, Union

import fire
import llama_factory_sharegpt_to_dj
from loguru import logger


Expand All @@ -74,34 +76,31 @@ def main(
messages_key: str = 'messages',
role_key: str = 'role',
content_key: str = 'content',
human_role: str = 'user',
assistant_role: str = 'assistant',
system_role: str = 'system',
instruction_role: str = 'instruction',
multimodal_keys: Union[str, List[str]] = None,
):
"""
Convert a ShareGPT-like dataset to the Data-Juicer query-response format.
Convert a Messages-like dataset to the Data-Juicer query-response format.
:param src_ds_path: the path to the source Messages-like dataset.
:param tgt_ds_path: the path to store the converted target dataset.
:param messages_key: the field key to store messages.
:param role_key: the field key to store the sentence from.
:param content_key: the field key to store the sentence content.
:param human_role: the role to store the human prompt.
:param assistant_role: the role to store the instruction content.
:param system_role: the role to store the system prompt.
:param instruction_role: the role to store the instruction content.
:param system_role: the field key to store the system prompt.
:param instruction_role: the field key to store the instruction content.
:param multimodal_keys: optional keys to store multimodal data.
"""
dj_to_llama_factory_sharegpt.main(
llama_factory_sharegpt_to_dj.main(
src_ds_path,
tgt_ds_path,
conversations_key=messages_key,
from_key=role_key,
value_key=content_key,
human_role=human_role,
assistant_role=assistant_role,
system_role=system_role,
instruction_role=instruction_role,
multimodal_keys=multimodal_keys,
)


Expand Down

0 comments on commit e0bd573

Please sign in to comment.