Skip to content

Commit

Permalink
Fix some typos and descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
zhijianma committed Dec 6, 2023
1 parent ef34418 commit f851215
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 35 deletions.
21 changes: 5 additions & 16 deletions data_juicer/ops/filter/image_text_matching_filter.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
import numpy as np
from jsonargparse.typing import ClosedUnitInterval
from PIL import ImageOps

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import SpecialTokens, load_image
from data_juicer.utils.mm_utils import (SpecialTokens, load_image,
remove_special_tokens)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_text_matching_filter'

with AvailabilityChecking(['torch'], OP_NAME):
with AvailabilityChecking(['torch', 'transformers'], OP_NAME):
import torch
import transformers # noqa: F401
from PIL import ImageOps

# avoid hanging when calling blip in multiprocessing
torch.set_num_threads(1)
Expand Down Expand Up @@ -102,18 +103,7 @@ def compute_stats(self, sample, context=False):
sample[Fields.context][loaded_image_key] = image

text = sample[self.text_key]
special_token_dict = {
key: value
for key, value in SpecialTokens.__dict__.items()
if not key.startswith('__')
}
offset = 0

def remove_special_token(text):
for value in special_token_dict.values():
text = text.replace(value, '')
return text

matching_scores = []
model, processor = get_model(self.model_key)

Expand All @@ -124,7 +114,7 @@ def remove_special_token(text):
if count == 0 or len(chunk) == 0:
continue
else:
text_chunk = remove_special_token(chunk)
text_chunk = remove_special_tokens(chunk)
image_chunk = []
for image_key in loaded_image_keys[offset:offset + count]:
image = images[image_key]
Expand Down Expand Up @@ -172,7 +162,6 @@ def process(self, sample):

# different strategies
if self.any:

return keep_bools.any()
else:
return keep_bools.all()
20 changes: 5 additions & 15 deletions data_juicer/ops/filter/image_text_similarity_filter.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import numpy as np
from jsonargparse.typing import ClosedUnitInterval
from PIL import ImageOps

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import SpecialTokens, load_image
from data_juicer.utils.mm_utils import (SpecialTokens, load_image,
remove_special_tokens)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_text_similarity_filter'

with AvailabilityChecking(['torch'], OP_NAME):
with AvailabilityChecking(['torch', 'transformers'], OP_NAME):

import torch
import transformers # noqa: F401
from PIL import ImageOps

# avoid hanging when calling clip in multiprocessing
torch.set_num_threads(1)
Expand Down Expand Up @@ -102,18 +103,7 @@ def compute_stats(self, sample, context=False):
sample[Fields.context][loaded_image_key] = image

text = sample[self.text_key]
special_token_dict = {
key: value
for key, value in SpecialTokens.__dict__.items()
if not key.startswith('__')
}
offset = 0

def remove_special_token(text):
for value in special_token_dict.values():
text = text.replace(value, '')
return text

similarity = []
model, processor = get_model(self.model_key)

Expand All @@ -124,7 +114,7 @@ def remove_special_token(text):
if count == 0 or len(chunk) == 0:
continue
else:
text_chunk = remove_special_token(chunk)
text_chunk = remove_special_tokens(chunk)
image_chunk = []
for image_key in loaded_image_keys[offset:offset + count]:
image = images[image_key]
Expand Down
15 changes: 15 additions & 0 deletions data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@ class SpecialTokens(object):
eoc = f'<|{DEFAULT_PREFIX}eoc|>'


def get_special_tokens():
    """Collect the special tokens declared on ``SpecialTokens``.

    Only entries stored in the class's own ``__dict__`` are considered.

    :return: a dict mapping every entry name that does not start with a
        double underscore to its value.
    """
    tokens = {}
    for name, token in vars(SpecialTokens).items():
        # skip double-underscore entries of the class namespace
        if name.startswith('__'):
            continue
        tokens[name] = token
    return tokens


def remove_special_tokens(text):
    """Strip every special token from a piece of text.

    :param text: the input string to clean.
    :return: ``text`` with each value returned by ``get_special_tokens()``
        replaced by an empty string.
    """
    cleaned = text
    for _, token in get_special_tokens().items():
        # tokens are removed one after another, in dict order
        cleaned = cleaned.replace(token, '')
    return cleaned


def load_images(paths):
    """Load several images by calling ``load_image`` on each path.

    :param paths: an iterable of image paths.
    :return: a list with the result of ``load_image`` for each path, in the
        same order as ``paths``.
    """
    images = []
    for image_path in paths:
        images.append(load_image(image_path))
    return images

Expand Down
4 changes: 2 additions & 2 deletions docs/Operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ All the specific operators are listed below, each featured with several capabili
| image_aspect_ratio_filter | Image | - | Keeps samples containing images with aspect ratios within a specific range |
| image_shape_filter | Image | - | Keeps samples containing images with widths and heights within specific ranges |
| image_size_filter | Image | - | Keeps samples containing images whose sizes in bytes are within a specific range |
| image_text_matching_filter | Multimodal | - | Keeps samples with matching score between text and images within the specified range |
| image_text_similarity_filter | Multimodal | - | Keeps samples with similarity between text and images within the specified range |
| image_text_matching_filter | Multimodal | - | Keeps samples with image-text classification matching score within the specified range based on a BLIP model |
| image_text_similarity_filter | Multimodal | - | Keeps samples with image-text feature cosine similarity within the specified range based on a CLIP model |
| language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score |
| maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range |
| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold |
Expand Down
4 changes: 2 additions & 2 deletions docs/Operators_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ Data-Juicer 中的算子分为以下 5 种类型。
| image_aspect_ratio_filter | Image | - | 保留样本中包含的图片的宽高比在指定范围内的样本 |
| image_shape_filter | Image | - | 保留样本中包含的图片的形状(即宽和高)在指定范围内的样本 |
| image_size_filter | Image | - | 保留样本中包含的图片的大小(bytes)在指定范围内的样本 |
| image_text_matching_filter | Multimodal | - | 保留文本图像匹配度在指定范围内的样本 |
| image_text_similarity_filter | Multimodal | - | 保留文本图像相似度在指定范围内的样本 |
| image_text_matching_filter | Multimodal | - | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本 |
| image_text_similarity_filter | Multimodal | - | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 |
| language_id_score_filter | General | en, zh | 保留特定语言的样本,通过预测的置信度得分来判断 |
| maximum_line_length_filter | Code | en, zh | 保留最大行长度在指定范围内的样本 |
| perplexity_filter | General | en, zh | 保留困惑度低于指定阈值的样本 |
Expand Down

0 comments on commit f851215

Please sign in to comment.