From a293e710290f91813cbbfa6144b2a2b1a3a59318 Mon Sep 17 00:00:00 2001 From: zhijianma Date: Mon, 13 Nov 2023 22:54:35 +0800 Subject: [PATCH] docs: pre-commit style --- data_juicer/utils/constant.py | 1 + data_juicer/utils/mm_utils.py | 4 +++- data_juicer/utils/model_utils.py | 5 +++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py index 401d408de..c3a68c32d 100644 --- a/data_juicer/utils/constant.py +++ b/data_juicer/utils/constant.py @@ -1,5 +1,6 @@ DEFAULT_PREFIX = '__dj__' + class Fields(object): stats = DEFAULT_PREFIX + 'stats__' meta = DEFAULT_PREFIX + 'meta__' diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py index b67484062..6ed0931e0 100644 --- a/data_juicer/utils/mm_utils.py +++ b/data_juicer/utils/mm_utils.py @@ -1,8 +1,8 @@ - from datasets import Image from data_juicer.utils.constant import DEFAULT_PREFIX + # A class to keep special tokens for multimodal information in the texts # The tokens in this class can be updated by corresponding arguments in config class SpecialTokens(object): @@ -12,9 +12,11 @@ class SpecialTokens(object): # others eoc = f'<|{DEFAULT_PREFIX}eoc|>' + def load_images(paths): return [load_image(path) for path in paths] + def load_image(path): img_feature = Image() img = img_feature.decode_example(img_feature.encode_example(path)) diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index e67b416f6..516dc7fe1 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -169,12 +169,13 @@ def prepare_huggingface_tokenizer(tokenizer_name): trust_remote_code=True) return tokenizer + def prepare_huggingface_clip(clip_name): """ Prepare and load a clip and processor from HuggingFace. - :param tokenizer_name: input tokenizer name - :return: a tokenizer instance. + :param clip_name: input clip name + :return: a pair of clip instance and processor instance. """ from transformers import CLIPProcessor, CLIPModel