Skip to content

Commit

Permalink
docs: pre-commit style
Browse files Browse the repository at this point in the history
  • Loading branch information
zhijianma committed Nov 13, 2023
1 parent ceada34 commit a293e71
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 3 deletions.
1 change: 1 addition & 0 deletions data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
DEFAULT_PREFIX = '__dj__'


class Fields(object):
stats = DEFAULT_PREFIX + 'stats__'
meta = DEFAULT_PREFIX + 'meta__'
Expand Down
4 changes: 3 additions & 1 deletion data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@

from datasets import Image

from data_juicer.utils.constant import DEFAULT_PREFIX


# A class to keep special tokens for multimodal information in the texts
# The tokens in this class can be updated by corresponding arguments in config
class SpecialTokens(object):
Expand All @@ -12,9 +12,11 @@ class SpecialTokens(object):
# others
eoc = f'<|{DEFAULT_PREFIX}eoc|>'


def load_images(paths):
    """
    Load every image referenced by the given paths.

    :param paths: iterable of image paths; each one is handed to
        ``load_image``
    :return: a list holding one loaded image per path, in input order
    """
    images = []
    for image_path in paths:
        images.append(load_image(image_path))
    return images


def load_image(path):
img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example(path))
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,13 @@ def prepare_huggingface_tokenizer(tokenizer_name):
trust_remote_code=True)
return tokenizer


def prepare_huggingface_clip(clip_name):
"""
Prepare and load a clip and processor from HuggingFace.
:param tokenizer_name: input tokenizer name
:return: a tokenizer instance.
:param clip_name: input clip name
:return: a pair of clip instance and processor instance.
"""
from transformers import CLIPProcessor, CLIPModel

Expand Down

0 comments on commit a293e71

Please sign in to comment.