add gpqa-main, gpqa-diamond, mmlu-pro-lite #14

Open · wants to merge 7 commits into main
2 changes: 2 additions & 0 deletions README.md
@@ -39,6 +39,8 @@ pip install -r requirements.txt
- [CRUX](https://crux-eval.github.io/) (`-d crux`)
- [MATH (Level 5)](https://huggingface.co/datasets/AI-MO/aimo-validation-math-level-5) (`-d math-l5`)
- [GSM8K](https://openai.com/index/solving-math-word-problems/) (`-d gsm`)
- [GPQA](https://arxiv.org/abs/2311.12022) (`-d gpqa-main` or `-d gpqa-diamond`)
- [MMLU-Pro-Lite](https://arxiv.org/abs/2406.01574) (`-d mmlu-pro-lite`)

- More tasks (e.g., ARC, the full MMLU-Pro) will be added soon.
<!-- - AlpacaEval (`-d alpaca-eval`) -->
32 changes: 32 additions & 0 deletions data_prep/apps.py
@@ -0,0 +1,32 @@
# https://huggingface.co/datasets/codeparrot/apps

import json

import datasets

dataset_path = "codeparrot/apps"
dataset_name = "apps"

dataset = datasets.load_dataset(dataset_path, "all", split="test")


def create_example(row, index):
    # Normalize an APPS row into the shared zeroeval record schema.
    new_example = {
        "id": row['problem_id'],
        "question": row['question'],
        "problem": row['question'],
        # `solutions` is a JSON-encoded list of reference programs; it can be empty.
        "answer": json.loads(row['solutions']) if row['solutions'] else None,
        "difficulty": row['difficulty'],
        # `input_output` is kept as a raw JSON string; decode it downstream.
        "testcases": row['input_output'],
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "code_completion",
    }
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
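One note on the record schema above: `testcases` stays a JSON-encoded string, so consumers must decode it before running anything. A minimal sketch of reading the stored testcases, assuming the upstream APPS layout of parallel `inputs`/`outputs` lists (the `example` record here is hypothetical):

```python
import json

# Hypothetical record in the shape produced by create_example above.
example = {
    "testcases": '{"inputs": ["1 2\\n"], "outputs": ["3\\n"]}',
}

# Decode the raw JSON string before use; guard against rows with no tests.
tests = json.loads(example["testcases"]) if example["testcases"] else {}
for stdin, expected in zip(tests.get("inputs", []), tests.get("outputs", [])):
    print(f"stdin={stdin!r} -> expected stdout={expected!r}")
```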
36 changes: 36 additions & 0 deletions data_prep/gpqa.py
@@ -0,0 +1,36 @@
import random

import datasets

dataset_path = "Idavidrein/gpqa"
dataset_name = "gpqa_diamond"
# dataset_name = "gpqa_main"

random.seed(0)

dataset = datasets.load_dataset(dataset_path, dataset_name, split="train")


def shuffle_choices_and_create_example(row, index):
    # Mix the gold answer in with the three distractors so that the correct
    # letter varies across examples (seeded above for reproducibility).
    list_choices = [row['Incorrect Answer 1'], row['Incorrect Answer 2'], row['Incorrect Answer 3'], row['Correct Answer']]
    random.shuffle(list_choices)
    new_example = {
        "id": f"{dataset_name}_{index}",
        "question": None,
        "choices": list_choices,
        "correct_answer": row['Correct Answer'],
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "multiple_choice",
    }
    prompt = f"What is the correct answer to this question: {row['Question']}"
    prompt += f"\n\nChoices:\n(A) {list_choices[0]}\n(B) {list_choices[1]}\n(C) {list_choices[2]}\n(D) {list_choices[3]}"
    prompt += "\nAnswer with the letter of the correct choice."
    new_example["question"] = prompt
    return new_example


dataset = dataset.map(shuffle_choices_and_create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
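Because the record stores the shuffled `choices` list plus the raw `correct_answer` text rather than a letter, a scorer has to recover the gold letter by position. A sketch against a hypothetical record (this assumes the correct answer text appears exactly once among the four choices):

```python
# Hypothetical record in the schema pushed above.
record = {
    "choices": ["4", "8", "2", "6"],
    "correct_answer": "6",
}

# list.index assumes the gold text is unique among the choices.
gold_index = record["choices"].index(record["correct_answer"])
gold_letter = chr(ord("A") + gold_index)
print(gold_letter)  # -> "D"
```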
29 changes: 29 additions & 0 deletions data_prep/math500.py
@@ -0,0 +1,29 @@
import datasets

dataset_path = "DongfuJiang/MATH-500"
dataset_name = "math_500"

dataset = datasets.load_dataset(dataset_path, split="test")


def create_example(row, index):
    # Map a MATH-500 row onto the shared zeroeval record schema.
    new_example = {
        "id": row['unique_id'],
        "question": row['problem'],
        "problem": row['problem'],
        "solution": row['solution'],
        "answer": row['answer'],
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "qa",
    }
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
53 changes: 53 additions & 0 deletions data_prep/math_test.py
@@ -0,0 +1,53 @@
from datasets import load_dataset

dataset_path = "lighteval/MATH"
dataset_name = "math"

dataset = load_dataset(dataset_path, "all", split="test")


def extract_answer(solution):
    # Return the contents of the first \boxed{...} in the solution, matching
    # nested braces; return None if no \boxed{} is present.
    start = solution.find("\\boxed{")
    if start == -1:
        return None
    start += len("\\boxed{")
    # Scan forward until the opening brace is balanced again.
    level = 1
    end = start - 1
    try:
        while level > 0:
            end += 1
            if solution[end] == "{":
                level += 1
            elif solution[end] == "}":
                level -= 1
    except Exception as e:
        # Unbalanced braces: dump context before re-raising for debugging.
        print(solution)
        print(start, end)
        print(solution[start:])
        print(solution[start:end])
        raise e
    return solution[start:end]


def create_example(row, index):
    new_example = {
        "id": index,
        "question": row['problem'],
        "problem": row['problem'],
        "solution": row['solution'],
        "level": row['level'],
        "type": row['type'],
        "answer": extract_answer(row['solution']),
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "qa",
    }
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
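The brace-counting loop in `extract_answer` exists so that nested groups inside `\boxed{...}` are captured whole rather than cut at the first `}`. A quick sanity check that could be appended to the script above:

```python
# Nested braces are matched in full, not truncated at the first "}".
assert extract_answer(r"So the answer is \boxed{42}.") == "42"
assert extract_answer(r"Thus \boxed{\frac{1}{2}} is final.") == r"\frac{1}{2}"
# Solutions without \boxed{} yield None.
assert extract_answer("No boxed answer here.") is None
```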
59 changes: 59 additions & 0 deletions data_prep/mathl5.py
@@ -0,0 +1,59 @@
import datasets

dataset_path = "lighteval/MATH"
dataset_name = "math_l5"

dataset = datasets.load_dataset(dataset_path, "all", split="test")


def extract_answer(solution):
    # Return the contents of the first \boxed{...} in the solution, matching
    # nested braces; return None if no \boxed{} is present.
    start = solution.find("\\boxed{")
    if start == -1:
        return None
    start += len("\\boxed{")
    # Scan forward until the opening brace is balanced again.
    level = 1
    end = start - 1
    try:
        while level > 0:
            end += 1
            if solution[end] == "{":
                level += 1
            elif solution[end] == "}":
                level -= 1
    except Exception as e:
        # Unbalanced braces: dump context before re-raising for debugging.
        print(solution)
        print(start, end)
        print(solution[start:])
        print(solution[start:end])
        raise e
    return solution[start:end]


def create_example(row, index):
    new_example = {
        "id": index,
        "question": row['problem'],
        "problem": row['problem'],
        "solution": row['solution'],
        "level": row['level'],
        "type": row['type'],
        "answer": extract_answer(row['solution']),
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "qa",
    }
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)


def filter_l5(example):
    # Keep only the hardest tier of MATH problems.
    return example['level'] == 'Level 5'


dataset = dataset.filter(filter_l5)

dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
39 changes: 39 additions & 0 deletions data_prep/mmlu-pro-lite.py
@@ -0,0 +1,39 @@
import datasets

dataset_path = "TIGER-Lab/MMLU-Pro"
dataset_name = "mmlu_pro_lite"

dataset = datasets.load_dataset(dataset_path, split="test")

# Subsample 1,000 questions for the "lite" split.
total_sampled_num = 1000
dataset = dataset.shuffle(seed=0).select(range(total_sampled_num))


def create_example(row, index):
    # Option order is kept as-is; the gold answer is recovered via answer_index.
    new_example = {
        "id": f"{dataset_name}_{row['question_id']}",
        "question": None,
        "choices": row['options'],
        "correct_answer": row['options'][row['answer_index']],
        "source": dataset_path,
        "config": dataset_name,
        "task_type": "multiple_choice",
    }
    list_choices = row['options']
    prompt = f"What is the correct answer to this question: {row['question']}"
    prompt += "\n\nChoices:"
    # MMLU-Pro items can carry up to 10 options, so label them dynamically (A-J).
    for i, choice in enumerate(list_choices):
        prompt += f"\n({chr(65 + i)}) {choice}"
    prompt += "\nAnswer with the letter of the correct choice."
    new_example["question"] = prompt
    return new_example


dataset = dataset.map(create_example, with_indices=True, remove_columns=dataset.column_names)
dataset.push_to_hub(
    repo_id="DongfuJiang/zeroeval",
    config_name=dataset_name,
    split='test',
    commit_message=f"Add {dataset_name} dataset",
)
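Unlike the fixed four-choice GPQA prompt, the letter labels here are generated with `chr(65 + i)` because MMLU-Pro questions can carry up to ten options. A one-liner illustrating the label sequence:

```python
# chr(65 + i) walks the alphabet from "A"; ten options map to A through J.
print([chr(65 + i) for i in range(10)])
# ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
```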
9 changes: 9 additions & 0 deletions data_prep/usaco.py
@@ -0,0 +1,9 @@
import json
import os

# Download and unpack the USACO subset (307 problems) shared via Google Drive.
os.system("gdown https://drive.google.com/uc?id=1z5ODOJMqyer1QxzYtEUZ2hbAx-7nU8Vi")
os.system("unzip data.zip")

with open("./datasets/usaco_subset307_dict.json") as f:
    data = json.load(f)
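The script stops after loading the dict, and the JSON layout is not documented in this PR, so a reasonable next step is to inspect it before wiring it into the schema used by the other prep scripts. A hypothetical continuation:

```python
# The dict layout is not shown in this PR; inspect it before relying on it.
print(f"{len(data)} entries loaded")
first_key = next(iter(data))
first_value = data[first_key]
print("sample key:", first_key)
print("sample value type:", type(first_value).__name__)
if isinstance(first_value, dict):
    print("sample fields:", list(first_value.keys()))
```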
2 changes: 1 addition & 1 deletion src/evaluation/mcqa_eval.py
@@ -122,7 +122,7 @@ def gen_results(run_name_folders):

if __name__ == "__main__":
data_name = sys.argv[1]
if data_name not in ["mmlu-redux"]:
if data_name not in ["mmlu-redux", "mmlu-pro-lite", "gpqa-diamond", "gpqa-main"]:
print(f"Invalid data name: {data_name}")
sys.exit(1)
run_name_folders = {
8 changes: 7 additions & 1 deletion src/task_configs.py
@@ -20,6 +20,12 @@ def mapping_task_names(data_name):
dataset = load_dataset("flydust/zero-eval", "crux", split="test")
elif data_name == "math-l5":
dataset = load_dataset("AI-MO/aimo-validation-math-level-5", split="train")
elif data_name == "gpqa-diamond":
dataset = load_dataset("DongfuJiang/zeroeval", "gpqa_diamond", split="test")
elif data_name == "mmlu-pro-lite":
dataset = load_dataset("DongfuJiang/zeroeval", "mmlu_pro_lite", split="test")
elif data_name == "gpqa-main":
dataset = load_dataset("DongfuJiang/zeroeval", "gpqa_main", split="test")
else:
raise ValueError(f"Data name {data_name} not supported")
return dataset, id_name
@@ -28,7 +34,7 @@ def prompt_generation(data_name, data_item, args):
"""
Generate prompt for different tasks.
"""
if data_name in ["mmlu-redux"]: # and other multiple-choice QA dataset
if data_name in ["mmlu-redux", "gpqa_diamond", "gpqa_main", "mmlu_pro_lite"]: # and other multiple-choice QA dataset
prompt = apply_mc_template(data_item)
elif data_name in ["alpaca_eval"]:
prompt = data_item["instruction"]
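`apply_mc_template` itself is not part of this diff. Since the prep scripts above already bake the lettered choices and the answer instruction into the `question` field, a minimal hypothetical version (a sketch, not the repo's actual implementation) might only need to wrap that field:

```python
def apply_mc_template(data_item):
    # Hypothetical sketch only; the real apply_mc_template is not in this PR.
    # The prep scripts store the fully formatted prompt (question, lettered
    # choices, answer instruction) under "question", so the template can
    # simply add a zero-shot reasoning preamble around it.
    return f"{data_item['question']}\n\nReason step by step before giving the letter."
```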