From 967bbb41932db2ec8aa4a3efbf7d1b4fcb0caff7 Mon Sep 17 00:00:00 2001 From: "brian.li" Date: Sat, 2 Nov 2024 06:20:49 +0000 Subject: [PATCH] Fix qwen vl image input bug --- lmms_eval/models/qwen2_vl.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lmms_eval/models/qwen2_vl.py b/lmms_eval/models/qwen2_vl.py index 565b7888..a7ac04f1 100755 --- a/lmms_eval/models/qwen2_vl.py +++ b/lmms_eval/models/qwen2_vl.py @@ -234,12 +234,13 @@ def _collate(x): texts = [self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] image_inputs, video_inputs = process_vision_info(messages) - total_frames = video_inputs[0].shape[0] - indices = np.linspace(0, total_frames - 1, self.max_num_frames, dtype=int) - # Append the last frame index if not already included - if total_frames - 1 not in indices: - indices = np.append(indices, total_frames - 1) - video_inputs[0] = video_inputs[0][indices] + if video_inputs is not None: + total_frames = video_inputs[0].shape[0] + indices = np.linspace(0, total_frames - 1, self.max_num_frames, dtype=int) + # Append the last frame index if not already included + if total_frames - 1 not in indices: + indices = np.append(indices, total_frames - 1) + video_inputs[0] = video_inputs[0][indices] inputs = self.processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt") if self.device_map == "auto":