Commit 87bf204
✨ feat(mia_bench): introduce new MIA-Bench task configuration and utilities

- add mia_bench.yaml for task configuration with dataset and evaluation settings
- implement utility functions for document processing and result generation
- integrate OpenAI and Azure API support for evaluation
- provide aggregation functions to calculate overall scores
Luodian committed Nov 23, 2024
1 parent fd7a4a6 commit 87bf204
Showing 2 changed files with 209 additions and 0 deletions.
24 changes: 24 additions & 0 deletions lmms_eval/tasks/mia_bench/mia_bench.yaml
@@ -0,0 +1,24 @@
dataset_path: lmms-lab/MIA-Bench
dataset_kwargs:
  token: True
task: "mia_bench"
test_split: test
doc_to_visual: !function utils.mia_bench_doc_to_visual
doc_to_text: !function utils.mia_bench_doc_to_text
doc_to_target: ""
process_results: !function utils.mia_bench_process_results
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mia_bench_aggregate_results
    higher_is_better: true

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
generation_kwargs:
  max_new_tokens: 512
metadata:
  version: 0.0
  task_type: image
  gpt_eval_model_name: "gpt-4o-2024-08-06"
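The utility functions in utils.py (below) read several fields off each dataset record: the image, the instruction, its components, and the per-component weights. A hypothetical record with those fields might look like the following sketch; the field names are taken from the accesses in utils.py, while the concrete values are invented for illustration only.

from PIL import Image

example_doc = {
    "image": Image.new("RGB", (64, 64)),  # stand-in for the dataset image
    "instruction": "Describe the painting in three sentences and mention its dominant color.",
    "components": ["describes the painting in three sentences", "mentions the dominant color"],
    "component_weight": [5, 5],  # assumed to sum to 10, since the total score is out of 10
}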
185 changes: 185 additions & 0 deletions lmms_eval/tasks/mia_bench/utils.py
@@ -0,0 +1,185 @@
import os
import time
from pathlib import Path

import requests
import yaml
from loguru import logger as eval_logger


def mia_bench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def mia_bench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    question_text = doc["instruction"]

    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") if lmms_eval_specific_kwargs else ""
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") if lmms_eval_specific_kwargs else ""
    formatted_question = f"{pre_prompt}{question_text}{post_prompt}"

    return formatted_question
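# Illustrative call (hypothetical doc, values invented): with empty pre/post prompts the
# formatted question is just the instruction itself, e.g.
#   mia_bench_doc_to_text({"instruction": "Describe the image."}, {"pre_prompt": "", "post_prompt": ""})
#   returns "Describe the image."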


# ============================
# Result Processing Functions
# ============================

with open(Path(__file__).parent / "mia_bench.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definitions since yaml.safe_load cannot handle the !function tag
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))

GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
NUM_SECONDS_TO_SLEEP = 10
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {"api-key": API_KEY, "Content-Type": "application/json", "api-version": "2023-07-01-preview"}


def get_eval(content: str, max_tokens: int, retries: int = 5):
    global headers

    messages = [
        {"role": "user", "content": content},
    ]

    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": messages,
        "temperature": 0,
        "max_tokens": max_tokens,
    }

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            response_data = response.json()

            content = response_data["choices"][0]["message"]["content"].strip()
            if content != "":
                return content, response_data["model"]
            eval_logger.info(f"Attempt {attempt + 1} returned an empty response.")

        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")

        if attempt < retries - 1:  # Retries remain: back off, then try again
            time.sleep(NUM_SECONDS_TO_SLEEP)
        else:  # Last attempt failed: give up and return empty results
            eval_logger.error(f"All {retries} attempts failed.")

    return "", ""


def generate_prompt(d, response):
    # Build the grading prompt that asks the judge model to score `response`
    # against each component of the instruction, weighted by component_weight.
    ordinals = ["first", "second", "third", "fourth", "fifth"]
    instruction = d["instruction"]
    component_list = d["components"]
    weights = [str(w) for w in d["component_weight"]]
    d["num_of_component"] = len(component_list)
    n = d["num_of_component"]

    # e.g. "The first component is:' ...', and the second component is:' ...'"
    parts = []
    for i in range(n):
        lead = "The" if i == 0 else "', and the"
        parts.append(f"{lead} {ordinals[i]} component is:' " + component_list[i])
    components = "".join(parts) + "'"

    # e.g. "The first and second components are worth 5 and 5 scores respectively."
    if n == 1:
        score = "The first component is worth " + weights[0] + " scores."
    else:
        names = ", ".join(ordinals[: n - 1]) + " and " + ordinals[n - 1]
        values = ", ".join(weights[:-1]) + " and " + weights[-1]
        score = "The " + names + " components are worth " + values + " scores respectively."

    return (
        """Here is an instruction for a multimodal LLM: ' """
        + instruction
        + """ You need to grade if the response from the model follows each component of the instruction. """
        + components
        + """ The response is:' """
        + response
        + """' You need to score the response and be strict. The total score ranges from 0 to 10, depending on if the response follows the instruction. """
        + score
        + " List scores of each component, and the total score in one sentence in this format: score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )
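# For the hypothetical two-component record sketched above (weights 5 and 5), the generated prompt
# embeds the instruction, lists both components as worth 5 and 5 scores, appends the model response,
# and asks the judge to answer in the fixed format
# "score of component 1: x/2, score of component 2: y/8, total score: z/10".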


def process_rawscore(component_type, raw_score):
    # The judge is asked to answer in the form
    # "score of component 1: x/2, score of component 2: y/8, total score: z/10. <explanation>".
    # Parse the first sentence into per-component fractions plus the overall total.
    first_sentence = raw_score.split(".")[0].split(",")
    score_dict = {}
    for i in range(len(first_sentence) - 1):
        score_ = first_sentence[i].split(":")[1][1:].split("/")
        score = int(score_[0]) / int(score_[1])
        score_dict[component_type[i]] = score
    total_score_ = first_sentence[-1].split(":")[1][1:].split("/")
    total_score = int(total_score_[0]) / int(total_score_[1])
    score_dict["total_score"] = total_score
    return score_dict
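# Example (hypothetical judge output): parsing
#   "score of component 1: 2/2, score of component 2: 6/8, total score: 8/10. The response ..."
# with component_type ["component A", "component B"] yields
#   {"component A": 1.0, "component B": 0.75, "total_score": 0.8}.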


def mia_bench_process_results(doc, results):
    response = results[0].strip()
    components = doc["components"]
    eval_prompt = generate_prompt(doc, response)
    eval_score, _ = get_eval(eval_prompt, 1024)
    score_dict = process_rawscore(components, eval_score)
    return {"gpt_eval_score": score_dict}


# ============================
# Aggregation Functions
# ============================


def mia_bench_aggregate_results(results):
    total_score = 0
    for result in results:
        # Overall accuracy
        total_score += result["total_score"]
    return total_score / len(results)
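# Illustrative aggregation over hypothetical per-sample score dicts: the overall metric is the
# mean of the "total_score" entries, e.g.
#   mia_bench_aggregate_results([{"total_score": 0.8}, {"total_score": 1.0}]) returns 0.9.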
