From 967bbb41932db2ec8aa4a3efbf7d1b4fcb0caff7 Mon Sep 17 00:00:00 2001
From: "brian.li" <brian.li@bytedance.com>
Date: Sat, 2 Nov 2024 06:20:49 +0000
Subject: [PATCH] Fix Qwen2-VL image input bug: skip video frame sampling when video_inputs is None

---
 lmms_eval/models/qwen2_vl.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/lmms_eval/models/qwen2_vl.py b/lmms_eval/models/qwen2_vl.py
index 565b7888..a7ac04f1 100755
--- a/lmms_eval/models/qwen2_vl.py
+++ b/lmms_eval/models/qwen2_vl.py
@@ -234,12 +234,13 @@ def _collate(x):
 
             texts = [self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
             image_inputs, video_inputs = process_vision_info(messages)
-            total_frames = video_inputs[0].shape[0]
-            indices = np.linspace(0, total_frames - 1, self.max_num_frames, dtype=int)
-            # Append the last frame index if not already included
-            if total_frames - 1 not in indices:
-                indices = np.append(indices, total_frames - 1)
-            video_inputs[0] = video_inputs[0][indices]
+            if video_inputs is not None:
+                total_frames = video_inputs[0].shape[0]
+                indices = np.linspace(0, total_frames - 1, self.max_num_frames, dtype=int)
+                # Append the last frame index if not already included
+                if total_frames - 1 not in indices:
+                    indices = np.append(indices, total_frames - 1)
+                video_inputs[0] = video_inputs[0][indices]
             inputs = self.processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
 
             if self.device_map == "auto":