add gpqa-main, gpqa-diamond, mmlu-pro-lite #14

Open · wants to merge 7 commits into main
2 changes: 2 additions & 0 deletions README.md
@@ -39,6 +39,8 @@ pip install -r requirements.txt
- [CRUX](https://crux-eval.github.io/) (`-d crux`)
- [MATH (Level 5)](https://huggingface.co/datasets/AI-MO/aimo-validation-math-level-5) (`-d math-l5`)
- [GSM8K](https://openai.com/index/solving-math-word-problems/) (`-d gsm`)
- [GPQA](https://arxiv.org/abs/2311.12022) (`-d gpqa-main` or `-d gpqa-diamond`)
- [MMLU-Pro-Lite](https://arxiv.org/abs/2406.01574) (`-d mmlu-pro-lite`)

- More tasks (e.g., ARC, the full MMLU-Pro) will be added soon.
<!-- - AlpacaEval (`-d alpaca-eval`) -->
32 changes: 32 additions & 0 deletions data_prep/apps.py
@@ -0,0 +1,32 @@
# https://huggingface.co/datasets/codeparrot/apps

import json

import datasets

dataset_path = "codeparrot/apps"
dataset_name = "apps"

dataset = datasets.load_dataset(dataset_path, "all", split="test")


def create_example(row, index):
    # Normalize an APPS row into the shared zeroeval record schema.
    new_example = {
        "id": row['problem_id'],
        "question": row['question'],
        "problem": row['question'],
        # `solutions` is a JSON-encoded list of reference programs; it can be empty.
        "answer": json.loads(row['solutions']) if row['solutions'] else None,
        "difficulty": row['difficulty'],
        # `input_output` is kept as a raw JSON string; decode it downstream.
        "testcases": row['input_output'],
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "code_completion",
    }
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
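One note on the record schema above: `testcases` stays a JSON-encoded string, so consumers must decode it before running anything. A minimal sketch of reading the stored testcases, assuming the upstream APPS layout of parallel `inputs`/`outputs` lists (the `example` record here is hypothetical):

```python
import json

# Hypothetical record in the shape produced by create_example above.
example = {
    "testcases": '{"inputs": ["1 2\\n"], "outputs": ["3\\n"]}',
}

# Decode the raw JSON string before use; guard against rows with no tests.
tests = json.loads(example["testcases"]) if example["testcases"] else {}
for stdin, expected in zip(tests.get("inputs", []), tests.get("outputs", [])):
    print(f"stdin={stdin!r} -> expected stdout={expected!r}")
```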
36 changes: 36 additions & 0 deletions data_prep/gpqa.py
@@ -0,0 +1,36 @@
import random

import datasets

dataset_path = "Idavidrein/gpqa"
dataset_name = "gpqa_diamond"
# dataset_name = "gpqa_main"

random.seed(0)

dataset = datasets.load_dataset(dataset_path, dataset_name, split="train")


def shuffle_choices_and_create_example(row, index):
    # Mix the gold answer in with the three distractors so that the correct
    # letter varies across examples (seeded above for reproducibility).
    list_choices = [row['Incorrect Answer 1'], row['Incorrect Answer 2'], row['Incorrect Answer 3'], row['Correct Answer']]
    random.shuffle(list_choices)
    new_example = {
        "id": f"{dataset_name}_{index}",
        "question": None,
        "choices": list_choices,
        "correct_answer": row['Correct Answer'],
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "multiple_choice",
    }
    prompt = f"What is the correct answer to this question: {row['Question']}"
    prompt += f"\n\nChoices:\n(A) {list_choices[0]}\n(B) {list_choices[1]}\n(C) {list_choices[2]}\n(D) {list_choices[3]}"
    prompt += "\nAnswer with the letter of the correct choice."
    new_example["question"] = prompt
    return new_example


dataset = dataset.map(shuffle_choices_and_create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
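Because the record stores the shuffled `choices` list plus the raw `correct_answer` text rather than a letter, a scorer has to recover the gold letter by position. A sketch against a hypothetical record (this assumes the correct answer text appears exactly once among the four choices):

```python
# Hypothetical record in the schema pushed above.
record = {
    "choices": ["4", "8", "2", "6"],
    "correct_answer": "6",
}

# list.index assumes the gold text is unique among the choices.
gold_index = record["choices"].index(record["correct_answer"])
gold_letter = chr(ord("A") + gold_index)
print(gold_letter)  # -> "D"
```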
29 changes: 29 additions & 0 deletions data_prep/math500.py
@@ -0,0 +1,29 @@
import datasets

dataset_path = "DongfuJiang/MATH-500"
dataset_name = "math_500"

dataset = datasets.load_dataset(dataset_path, split="test")


def create_example(row, index):
    # Map a MATH-500 row onto the shared zeroeval record schema.
    new_example = {
        "id": row['unique_id'],
        "question": row['problem'],
        "problem": row['problem'],
        "solution": row['solution'],
        "answer": row['answer'],
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "qa",
    }
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
53 changes: 53 additions & 0 deletions data_prep/math_test.py
@@ -0,0 +1,53 @@
from datasets import load_dataset

dataset_path = "lighteval/MATH"
dataset_name = "math"

dataset = load_dataset(dataset_path, "all", split="test")


def extract_answer(solution):
    # Return the contents of the first \boxed{...} in the solution, matching
    # nested braces; return None if no \boxed{} is present.
    start = solution.find("\\boxed{")
    if start == -1:
        return None
    start += len("\\boxed{")
    # Scan forward until the opening brace is balanced again.
    level = 1
    end = start - 1
    try:
        while level > 0:
            end += 1
            if solution[end] == "{":
                level += 1
            elif solution[end] == "}":
                level -= 1
    except Exception as e:
        # Unbalanced braces: dump context before re-raising for debugging.
        print(solution)
        print(start, end)
        print(solution[start:])
        print(solution[start:end])
        raise e
    return solution[start:end]


def create_example(row, index):
    new_example = {
        "id": index,
        "question": row['problem'],
        "problem": row['problem'],
        "solution": row['solution'],
        "level": row['level'],
        "type": row['type'],
        "answer": extract_answer(row['solution']),
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "qa",
    }
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
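The brace-counting loop in `extract_answer` exists so that nested groups inside `\boxed{...}` are captured whole rather than cut at the first `}`. A quick sanity check that could be appended to the script above:

```python
# Nested braces are matched in full, not truncated at the first "}".
assert extract_answer(r"So the answer is \boxed{42}.") == "42"
assert extract_answer(r"Thus \boxed{\frac{1}{2}} is final.") == r"\frac{1}{2}"
# Solutions without \boxed{} yield None.
assert extract_answer("No boxed answer here.") is None
```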
59 changes: 59 additions & 0 deletions data_prep/mathl5.py
@@ -0,0 +1,59 @@
import datasets

dataset_path = "lighteval/MATH"
dataset_name = "math_l5"

dataset = datasets.load_dataset(dataset_path, "all", split="test")


def extract_answer(solution):
    # Return the contents of the first \boxed{...} in the solution, matching
    # nested braces; return None if no \boxed{} is present.
    start = solution.find("\\boxed{")
    if start == -1:
        return None
    start += len("\\boxed{")
    # Scan forward until the opening brace is balanced again.
    level = 1
    end = start - 1
    try:
        while level > 0:
            end += 1
            if solution[end] == "{":
                level += 1
            elif solution[end] == "}":
                level -= 1
    except Exception as e:
        # Unbalanced braces: dump context before re-raising for debugging.
        print(solution)
        print(start, end)
        print(solution[start:])
        print(solution[start:end])
        raise e
    return solution[start:end]


def create_example(row, index):
    new_example = {
        "id": index,
        "question": row['problem'],
        "problem": row['problem'],
        "solution": row['solution'],
        "level": row['level'],
        "type": row['type'],
        "answer": extract_answer(row['solution']),
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "qa",
    }
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)


def filter_l5(example):
    # Keep only the hardest tier of MATH problems.
    return example['level'] == 'Level 5'


dataset = dataset.filter(filter_l5)

dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
39 changes: 39 additions & 0 deletions data_prep/mmlu-pro-lite.py
@@ -0,0 +1,39 @@
import datasets

dataset_path = "TIGER-Lab/MMLU-Pro"
dataset_name = "mmlu_pro_lite"

dataset = datasets.load_dataset(dataset_path, split="test")

# Subsample 1,000 questions for the "lite" split.
total_sampled_num = 1000
dataset = dataset.shuffle(seed=0).select(range(total_sampled_num))


def create_example(row, index):
    # Option order is kept as-is; the gold answer is recovered via answer_index.
    new_example = {
        "id": f"{dataset_name}_{row['question_id']}",
        "question": None,
        "choices": row['options'],
        "correct_answer": row['options'][row['answer_index']],
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "multiple_choice",
    }
    list_choices = row['options']
    prompt = f"What is the correct answer to this question: {row['question']}"
    prompt += "\n\nChoices:"
    # MMLU-Pro items can carry up to 10 options, so label them dynamically (A-J).
    for i, choice in enumerate(list_choices):
        prompt += f"\n({chr(65 + i)}) {choice}"
    prompt += "\nAnswer with the letter of the correct choice."
    new_example["question"] = prompt
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
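Unlike the fixed four-choice GPQA prompt, the letter labels here are generated with `chr(65 + i)` because MMLU-Pro questions can carry up to ten options. A one-liner illustrating the label sequence:

```python
# chr(65 + i) walks the alphabet from "A"; ten options map to A through J.
print([chr(65 + i) for i in range(10)])
# ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
```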
9 changes: 9 additions & 0 deletions data_prep/usaco.py
@@ -0,0 +1,9 @@
import json
import os

# Download and unpack the USACO subset (307 problems) shared via Google Drive.
os.system("gdown https://drive.google.com/uc?id=1z5ODOJMqyer1QxzYtEUZ2hbAx-7nU8Vi")
os.system("unzip data.zip")

with open("./datasets/usaco_subset307_dict.json") as f:
    data = json.load(f)
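The script stops after loading the dict, and the JSON layout is not documented in this PR, so a reasonable next step is to inspect it before wiring it into the schema used by the other prep scripts. A hypothetical continuation:

```python
# The dict layout is not shown in this PR; inspect it before relying on it.
print(f"{len(data)} entries loaded")
first_key = next(iter(data))
first_value = data[first_key]
print("sample key:", first_key)
print("sample value type:", type(first_value).__name__)
if isinstance(first_value, dict):
    print("sample fields:", list(first_value.keys()))
```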
2 changes: 1 addition & 1 deletion src/evaluation/mcqa_eval.py
@@ -122,7 +122,7 @@ def gen_results(run_name_folders):

if __name__ == "__main__":
data_name = sys.argv[1]
if data_name not in ["mmlu-redux"]:
if data_name not in ["mmlu-redux", "mmlu-pro-lite", "gpqa-diamond", "gpqa-main"]:
print(f"Invalid data name: {data_name}")
sys.exit(1)
run_name_folders = {
8 changes: 7 additions & 1 deletion src/task_configs.py
@@ -20,6 +20,12 @@ def mapping_task_names(data_name):
dataset = load_dataset("flydust/zero-eval", "crux", split="test")
elif data_name == "math-l5":
dataset = load_dataset("AI-MO/aimo-validation-math-level-5", split="train")
elif data_name == "gpqa-diamond":
dataset = load_dataset("DongfuJiang/zeroeval", "gpqa_diamond", split="test")
elif data_name == "mmlu-pro-lite":
dataset = load_dataset("DongfuJiang/zeroeval", "mmlu_pro_lite", split="test")
elif data_name == "gpqa-main":
dataset = load_dataset("DongfuJiang/zeroeval", "gpqa_main", split="test")
else:
raise ValueError(f"Data name {data_name} not supported")
return dataset, id_name
@@ -28,7 +34,7 @@ def prompt_generation(data_name, data_item, args):
"""
Generate prompt for different tasks.
"""
if data_name in ["mmlu-redux"]: # and other multiple-choice QA dataset
if data_name in ["mmlu-redux", "gpqa_diamond", "gpqa_main", "mmlu_pro_lite"]: # and other multiple-choice QA dataset
prompt = apply_mc_template(data_item)
elif data_name in ["alpaca_eval"]:
prompt = data_item["instruction"]
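`apply_mc_template` itself is not part of this diff. Since the prep scripts above already bake the lettered choices and the answer instruction into the `question` field, a minimal hypothetical version (a sketch, not the repo's actual implementation) might only need to wrap that field:

```python
def apply_mc_template(data_item):
    # Hypothetical sketch only; the real apply_mc_template is not in this PR.
    # The prep scripts store the fully formatted prompt (question, lettered
    # choices, answer instruction) under "question", so the template can
    # simply add a zero-shot reasoning preamble around it.
    return f"{data_item['question']}\n\nReason step by step before giving the letter."
```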