# demo.py (forked from openai/simple-evals)
import json
import time
import pandas as pd
from . import common
from .drop_eval import DropEval
from .gpqa_eval import GPQAEval
from .humaneval_eval import HumanEval
from .math_eval import MathEval
from .mgsm_eval import MGSMEval
from .mmlu_eval import MMLUEval
from .sampler.chat_completion_sampler import (
OPENAI_SYSTEM_MESSAGE_API,
OPENAI_SYSTEM_MESSAGE_CHATGPT,
ChatCompletionSampler,
)
# from .sampler.claude_sampler import ClaudeCompletionSampler, CLAUDE_SYSTEM_MESSAGE_LMSYS
def main():
debug = True
samplers = {
# chatgpt models:
"gpt-4-turbo-2024-04-09_assistant": ChatCompletionSampler(
model="gpt-4-turbo-2024-04-09",
system_message=OPENAI_SYSTEM_MESSAGE_API,
),
"gpt-4-turbo-2024-04-09_chatgpt": ChatCompletionSampler(
model="gpt-4-turbo-2024-04-09",
system_message=OPENAI_SYSTEM_MESSAGE_CHATGPT,
),
"gpt-4o_assistant": ChatCompletionSampler(
model="gpt-4o",
system_message=OPENAI_SYSTEM_MESSAGE_API,
max_tokens=2048,
),
"gpt-4o_chatgpt": ChatCompletionSampler(
model="gpt-4o",
system_message=OPENAI_SYSTEM_MESSAGE_CHATGPT,
max_tokens=2048,
),
# claude models:
# "claude-3-opus-20240229_empty": ClaudeCompletionSampler(
# model="claude-3-opus-20240229", system_message=None,
# ),
}
equality_checker = ChatCompletionSampler(model="gpt-4-turbo-preview")
# ^^^ used for fuzzy matching, just for math
def get_evals(eval_name):
match eval_name:
case "mmlu":
return MMLUEval(num_examples=1 if debug else 2500)
case "math":
return MathEval(
equality_checker=equality_checker, num_examples=5 if debug else 2500
)
case "gpqa":
return GPQAEval(n_repeats=1 if debug else 10, num_examples=5 if debug else None)
case "mgsm":
return MGSMEval(num_examples_per_lang=10 if debug else 250)
case "drop":
return DropEval(num_examples=10 if debug else 2000, train_samples_per_prompt=3)
case "humaneval":
return HumanEval(num_examples=10 if debug else None)
case _:
raise Exception(f"Unrecoginized eval type: {eval_name}")
evals = {
eval_name: get_evals(eval_name) for eval_name in ["mmlu", "math", "gpqa", "mgsm", "drop"]
}
print(evals)
debug_suffix = "_DEBUG" if debug else ""
mergekey2resultpath = {}
for sampler_name, sampler in samplers.items():
for eval_name, eval_obj in evals.items():
result = eval_obj(sampler)
# ^^^ how to use a sampler
file_stem = f"{eval_name}_{sampler_name}"
report_filename = f"/tmp/{file_stem}{debug_suffix}.html"
print(f"Writing report to {report_filename}")
with open(report_filename, "w") as fh:
fh.write(common.make_report(result))
metrics = result.metrics | {"score": result.score}
print(metrics)
result_filename = f"/tmp/{file_stem}{debug_suffix}.json"
with open(result_filename, "w") as f:
f.write(json.dumps(metrics, indent=2))
print(f"Writing results to {result_filename}")
mergekey2resultpath[f"{file_stem}"] = result_filename
merge_metrics = []
for eval_sampler_name, result_filename in mergekey2resultpath.items():
try:
result = json.load(open(result_filename, "r+"))
except Exception as e:
print(e, result_filename)
continue
result = result.get("f1_score", result.get("score", None))
eval_name = eval_sampler_name[: eval_sampler_name.find("_")]
sampler_name = eval_sampler_name[eval_sampler_name.find("_") + 1 :]
merge_metrics.append(
{"eval_name": eval_name, "sampler_name": sampler_name, "metric": result}
)
merge_metrics_df = pd.DataFrame(merge_metrics).pivot(
index=["sampler_name"], columns="eval_name"
)
print("\nAll results: ")
print(merge_metrics_df.to_markdown())
return merge_metrics
# Script entry point: run the demo only when this module is executed as the
# main program, not when it is imported.
if __name__ == "__main__":
    main()