Skip to content

Commit

Permalink
Enhance video loading functionality with resizing options and increased frame limit (#383)
Browse files Browse the repository at this point in the history
  • Loading branch information
pufanyi authored Oct 30, 2024
1 parent e05c93e commit e4f1816
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 4 deletions.
13 changes: 11 additions & 2 deletions lmms_eval/models/model_utils/load_video.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import base64
from io import BytesIO
from typing import Optional
from typing import Optional, Tuple

import av
import numpy as np
Expand Down Expand Up @@ -113,11 +113,20 @@ def read_video_pyav_pil(video_path: str, *, num_frm: int = 8, fps: float = None,
return [Image.fromarray(frame) for frame in frames]


def read_video_pyav_base64(video_path: str, *, num_frm: int = 8, fps: Optional[float] = None, format="rgb24", img_format="PNG"):
def read_video_pyav_base64(video_path: str, *, num_frm: int = 8, fps: Optional[float] = None, format="rgb24", img_format="PNG", max_image_size: Optional[Tuple[int, int] | int] = None, resize_strategy: str = "resize"):
frames = read_video_pyav(video_path, num_frm=num_frm, fps=fps, format=format)
base64_frames = []
for frame in frames:
img = Image.fromarray(frame)
if max_image_size:
if resize_strategy == "resize":
if isinstance(max_image_size, int):
max_image_size = (max_image_size, max_image_size)
img = img.resize(max_image_size)
elif resize_strategy == "thumbnail":
img.thumbnail(max_image_size)
else:
raise ValueError(f"Unknown resize strategy: {resize_strategy}")
output_buffer = BytesIO()
img.save(output_buffer, format=img_format)
byte_data = output_buffer.getvalue()
Expand Down
8 changes: 6 additions & 2 deletions lmms_eval/models/qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,10 @@ def __init__(
use_flash_attention_2: Optional[bool] = True,
max_pixels: int = 12845056,
min_pixels: int = 3136,
max_num_frames: int = 32,
max_num_frames: int = 256,
use_custom_video_loader: Optional[bool] = False,
fps: Optional[float] = None, # Only applicable if use_custom_video_loader is True
max_image_size: Optional[int] = None, # Only applicable if use_custom_video_loader is True
**kwargs,
) -> None:
super().__init__()
Expand All @@ -55,6 +56,9 @@ def __init__(
self.fps = fps
if self.fps and not self.use_custom_video_loader:
raise ValueError("FPS is only applicable if use_custom_video_loader is True")
self.max_image_size = max_image_size
if self.max_image_size and not self.use_custom_video_loader:
raise ValueError("max_image_size is only applicable if use_custom_video_loader is True")

self.use_custom_video_loader = use_custom_video_loader
self.fps = fps
Expand Down Expand Up @@ -222,7 +226,7 @@ def _collate(x):
visual = visuals[i] if i < len(visuals) else None
if isinstance(visual, str) and visual.endswith((".mp4", ".avi", ".mov")): # Video file
if self.use_custom_video_loader:
visual = read_video_pyav_base64(visual, num_frm=self.max_num_frames, fps=self.fps, img_format="JPEG")
visual = read_video_pyav_base64(visual, num_frm=self.max_num_frames, fps=self.fps, img_format="JPEG", max_image_size=self.max_image_size)
image_contents = list(map(lambda x: f"data:image/jpeg;base64,{x}", visual))
message.append({"role": "user", "content": [{"type": "video", "video": image_contents}, {"type": "text", "text": context}]})
else:
Expand Down

0 comments on commit e4f1816

Please sign in to comment.