From 4f18c854e45b4651e2d3a09041108d0e9c41fa04 Mon Sep 17 00:00:00 2001 From: kcz358 Date: Sat, 23 Nov 2024 06:04:24 +0000 Subject: [PATCH] Fix neptune doc to visual logic --- lmms_eval/models/llava_onevision.py | 17 +++-------------- lmms_eval/models/model_utils/load_video.py | 4 ++-- lmms_eval/tasks/neptune/neptune_full.yaml | 3 ++- lmms_eval/tasks/neptune/utils.py | 22 +++++++++++----------- 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/lmms_eval/models/llava_onevision.py b/lmms_eval/models/llava_onevision.py index 1e24b2a0..838c10f0 100644 --- a/lmms_eval/models/llava_onevision.py +++ b/lmms_eval/models/llava_onevision.py @@ -22,7 +22,7 @@ from lmms_eval.api.instance import Instance from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model -from lmms_eval.models.model_utils.load_video import read_video_pyav +from lmms_eval.models.model_utils.load_video import load_video_decord, read_video_pyav # Suppress warnings warnings.filterwarnings("ignore") @@ -367,17 +367,6 @@ def flatten(self, input): new_list.append(j) return new_list - def load_video(self, video_path, max_frames_num): - if type(video_path) == str: - vr = VideoReader(video_path, ctx=cpu(0)) - else: - vr = VideoReader(video_path[0], ctx=cpu(0)) - total_frame_num = len(vr) - uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int) - frame_idx = uniform_sampled_frames.tolist() - spare_frames = vr.get_batch(frame_idx).asnumpy() - return spare_frames # (frames, height, width, channels) - def generate_until(self, requests: List[Instance]) -> List[str]: res = [] @@ -461,7 +450,7 @@ def _collate(x): image_tensor = [] try: if self.video_decode_backend == "decord": - frames = self.load_video(visual, self.max_frames_num) + frames = load_video_decord(visual, self.max_frames_num) elif self.video_decode_backend == "pyav": frames = read_video_pyav(visual[0], num_frm=self.max_frames_num) frames = self._image_processor.preprocess(frames, 
return_tensors="pt")["pixel_values"].half().cuda() @@ -672,7 +661,7 @@ def _collate(x): image_tensor = [] try: if self.video_decode_backend == "decord": - frames = self.load_video(visual, self.max_frames_num) + frames = load_video_decord(visual, self.max_frames_num) elif self.video_decode_backend == "pyav": frames = read_video_pyav(visual[0], num_frm=self.max_frames_num) frames = self._image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].half().cuda() diff --git a/lmms_eval/models/model_utils/load_video.py b/lmms_eval/models/model_utils/load_video.py index 2d4879cc..3fb0372b 100644 --- a/lmms_eval/models/model_utils/load_video.py +++ b/lmms_eval/models/model_utils/load_video.py @@ -6,9 +6,9 @@ def load_video_decord(video_path, max_frames_num): if type(video_path) == str: - vr = VideoReader(video_path, ctx=cpu(0)) + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) else: - vr = VideoReader(video_path[0], ctx=cpu(0)) + vr = VideoReader(video_path[0], ctx=cpu(0), num_threads=1) total_frame_num = len(vr) uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int) frame_idx = uniform_sampled_frames.tolist() diff --git a/lmms_eval/tasks/neptune/neptune_full.yaml b/lmms_eval/tasks/neptune/neptune_full.yaml index 52534c47..e83ca82e 100755 --- a/lmms_eval/tasks/neptune/neptune_full.yaml +++ b/lmms_eval/tasks/neptune/neptune_full.yaml @@ -2,7 +2,8 @@ dataset_path: lmms-lab/GoogleDeepMind-NEPTUNE dataset_name: full dataset_kwargs: token: True - cache_dir: ./ + cache_dir: neptune + video: True task: "neptune_full" test_split: test output_type: generate_until diff --git a/lmms_eval/tasks/neptune/utils.py b/lmms_eval/tasks/neptune/utils.py index f27a837f..775ea30b 100755 --- a/lmms_eval/tasks/neptune/utils.py +++ b/lmms_eval/tasks/neptune/utils.py @@ -1,9 +1,10 @@ import json import os +import sys +from glob import glob from pathlib import Path import yaml -import sys from loguru import logger from 
lmms_eval.tasks._task_utils.file_utils import generate_submission_file @@ -24,17 +25,16 @@ def neptune_full_doc_to_visual(doc): cache_dir = os.path.join(base_cache_dir, cache_name) - video_path = doc["video_path"] - video_path = os.path.join(cache_dir, "downloads", video_path) - if os.path.exists(video_path): - video_path = video_path - elif os.path.exists(video_path.replace("mp4", "MP4")): - video_path = video_path.replace("mp4", "MP4") - elif os.path.exists(video_path.replace("mp4", "mkv")): - video_path = video_path.replace("mp4", "mkv") + video_path = doc["video_path"].split(".")[0] + "*.mp4" + video_path = os.path.join(cache_dir, video_path) + video_files = [f for f in glob(video_path) if "temp" not in f] + if len(video_files) > 1: + return video_files[:1] + elif len(video_files) > 0: + return video_files else: - sys.exit(f"video path:{video_path} does not exist, please check") - return [video_path] + # HACK: return a placeholder instead of exiting so this sample can be skipped + return [f"video path:{video_path} does not exist, please check"] def neptune_full_doc_to_text(doc, lmms_eval_specific_kwargs):