[Update] Update MATH dataset with model judge (#1711)
* Update math with llm judge

* Update math with llm judge

* Update math with llm judge

* Update math with llm judge

* Update math with llm judge
liushz authored Nov 25, 2024
1 parent 80e3b9e commit e49fcfd
Showing 5 changed files with 226 additions and 21 deletions.
47 changes: 47 additions & 0 deletions configs/eval_math_llm_judge_internal.py
@@ -0,0 +1,47 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.math.math_0shot_llm_judge_v2_gen_31d777 import math_datasets

    # Select a model of interest
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as qwen2_5_72b_instruct_model

eval_model_name = 'eval_model_name'
postprocessor_model_name = 'postprocessor_model_name'
eval_model_urls = ['http://0.0.0.0:23333/v1']
postprocessor_model_urls = ['http://0.0.0.0:23333/v1']

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])


for dataset in datasets:
    dataset['eval_cfg']['evaluator']['model_name'] = eval_model_name
    dataset['eval_cfg']['evaluator']['url'] = eval_model_urls
    dataset['eval_cfg']['evaluator']['post_url'] = postprocessor_model_urls
    dataset['eval_cfg']['evaluator']['post_model_name'] = postprocessor_model_name


# ------------- Inference Stage ----------------------------------------

from opencompass.runners import LocalRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,
        task=dict(type=OpenICLInferTask)
    ),
)

# ------------- Evaluation Stage ---------------------------------------

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask)
    ),
)
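As a quick usage note: a config like this is typically launched with python run.py configs/eval_math_llm_judge_internal.py, and the loop above should leave every dataset pointing at the deployed judge and postprocessor endpoints. A minimal sanity-check sketch (illustrative only, not part of this commit):

# Sketch: confirm the loop above rewired each dataset's evaluator.
for dataset in datasets:
    evaluator_cfg = dataset['eval_cfg']['evaluator']
    assert evaluator_cfg['model_name'] == eval_model_name
    assert evaluator_cfg['url'] == eval_model_urls
    assert evaluator_cfg['post_url'] == postprocessor_model_urls
    assert evaluator_cfg['post_model_name'] == postprocessor_model_name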
51 changes: 51 additions & 0 deletions configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, GaoKaoMATHEvaluator

# ----------------------------- Model Eval Parameters -----------------------------

naive_model_name = 'dlc_model'  # replace with your model name
naive_model_url = ['http://0.0.0.0:23333/v1']  # multiple API endpoints for acceleration

# ----------------------------- Detailed Config -----------------------------

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048),
)

evaluator = dict(
    type=GaoKaoMATHEvaluator,
    model_name=naive_model_name,
    url=naive_model_url,
    language='en',
    with_postprocess=True,
    post_url=naive_model_url,
    post_model_name=naive_model_name,
)

math_eval_cfg = dict(
    evaluator=evaluator,
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
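To see what the inference prompt looks like concretely: OpenCompass substitutes {problem} in the HUMAN template while the literal braces in \boxed{} survive. A small sketch (str.replace stands in for the framework's own template substitution, and the sample problem is made up):

# Sketch of the rendered inference prompt.
template = ('{problem}\nPlease reason step by step, and put your final '
            'answer within \\boxed{}.')
problem = 'What is $1+1$?'
print(template.replace('{problem}', problem))
# What is $1+1$?
# Please reason step by step, and put your final answer within \boxed{}.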
29 changes: 29 additions & 0 deletions opencompass/datasets/compassbench_obj.py
@@ -63,6 +63,35 @@ def load(path: str, name: str):
        return dataset


@LOAD_DATASET.register_module()
class CompassBenchObjectiveMath(BaseDataset):

    @staticmethod
    def load(path: str):
        with open(path, 'r') as infile:
            data = [json.loads(line) for line in infile]
        for idx in range(len(data)):
            item = data[idx]
            prefix = ''
            if item.get('question_type',
                        None) and item['question_type'] in [
                            'multiple-answer', '多选题'
                        ]:
                if '_en_' in path:
                    prefix = 'This question may have multiple answers, \
please select all correct answers, like this: A, B, C as your final answer\n'

                else:
                    prefix = '这道题可能有多个正确答案,请选择所有正确的答案,\
例如:A, B, C 作为你的最终答案\n'

            if item.get('options', None) and len(item['options']) != 0:
                item['question'] = prefix + item[
                    'question'] + '\n' + get_number(item['options'])
        dataset = Dataset.from_list(data)
        return dataset


@TEXT_POSTPROCESSORS.register_module()
def compassbench_objective_v1_3_postprocess(text: str, name) -> str:
    split = False
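To make the rewrite above concrete, here is a sketch of how one English multiple-answer JSONL item is transformed. get_number is not shown in this diff, so it is stubbed here under the assumption that it letters the options:

import json

def get_number(options):  # hypothetical stub; the real helper lives elsewhere
    return '\n'.join(f'{chr(65 + i)}. {opt}' for i, opt in enumerate(options))

line = json.dumps({'question': 'Which of the following are prime?',
                   'question_type': 'multiple-answer',
                   'options': ['2', '3', '4']})
item = json.loads(line)
prefix = ('This question may have multiple answers, please select all '
          'correct answers, like this: A, B, C as your final answer\n')
item['question'] = prefix + item['question'] + '\n' + get_number(item['options'])
print(item['question'])
# prefix, then the question, then "A. 2 / B. 3 / C. 4" on separate lines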
118 changes: 98 additions & 20 deletions opencompass/datasets/gaokao_math.py
@@ -12,7 +12,6 @@

# from opencompass.utils import get_data_path


EVAL_PROMPT = """
请你作为一个数学高考阅卷专家,判断下面的答案是否与标准答案一致,即考生是否回答正确。下面是一些评判标准:
1. 有些答案可能包含多项内容,可能有单选题,多选题,填空题等,只要答案与标准答案一致即可, 对于多选题和多个空的填空题,需要考生对应的选项或空都回答正确才算正确。
@@ -27,6 +26,42 @@
分析:
""" # noqa E501

POST_PROMPT_CN="""
你是一个乐于助人的助手,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。
我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。
对于单选题,答案应该是选项字母,例如 "A";
对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"];
对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,同一空白的答案可能有多个,请在同一个 string 中用逗号隔开表示,如 ['sqrt(x) 且 x > 10', '1/2, 1/3', '1/4'] 代表问题包含三小问,第一小问包含取值范围信息,第二小问有两个答案,第三小问有一个答案。
对于解答题,类似填空题,答案应该是一个答案列表,每小问的答案间用逗号隔开,同样需要注意某些小问答案多个的情况。
如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。
问题类型: {question_type}
原始问题: {question}
回答: {response}
提取的关键答案:
""" # noqa E501

POST_PROMPT_EN="""
You are a helpful assistant whose task is to extract precise key answers from given response sentences. You must only provide the extracted key answers without any additional text.
I will provide you with a question, a response sentence, and the question type. The response sentence is a reply to the provided question. Using the provided information, you must accurately and precisely identify and extract the expected key answers from the response sentence. Please do not provide subjective opinions about the question.
For multiple-choice questions, the answer should be the letter of the option, such as "A".
For multiple-answer questions, the answer should be a list of option letters, such as ["A"] or ["A", "B", "C"].
For fill-in-the-blank questions, the answer should be a list of answers to fill in the blanks. The number of items in the list should match the number of blanks in the question. If there are multiple answers for the same blank, separate them with a comma within the same string, like ['sqrt(x) and x > 10', '1/2, 1/3', '1/4'], which represents three sub-questions where the first sub-question includes a range, the second sub-question has two answers, and the third sub-question has one answer.
For problem-solving questions, similar to fill-in-the-blank questions, the answer should be a list of answers. Separate answers for different sub-questions with commas, and note that some sub-questions may have multiple answers.
If the response sentence provides multiple different answers, carefully determine whether a later provided answer is a correction or modification of an earlier answer. If so, extract this corrected or modified answer as the final answer. Conversely, if the response sentence fluctuates between multiple answers without a clear final answer, you should output [No valid answer].
Question type: {question_type}
Question: {question}
Output sentences: {response}
Key extracted answer:
""" # noqa E501


def extract_boxed_answer(text):
    match = re.findall(r'\\boxed{(.+?)}', text)
@@ -57,7 +92,15 @@ def load(path: str):
@ICL_EVALUATORS.register_module()
class GaoKaoMATHEvaluator(BaseEvaluator):

-    def __init__(self, model_name, url, **kwargs):
+    def __init__(self,
+                 model_name,
+                 url,
+                 question_type=None,
+                 language='en',
+                 with_postprocess=False,
+                 post_url=[],
+                 post_model_name='',
+                 **kwargs):
        if isinstance(url, str):
            url = [url]

@@ -68,45 +111,90 @@ def __init__(self, model_name, url, **kwargs):
                    path=model_name,
                    openai_api_base=url,
                    key='EMPTY',
-                    query_per_second=1,
+                    query_per_second=2,
                    meta_template=api_meta_template,
-                    temperature=kwargs.get('temperature', 0.01),
+                    temperature=kwargs.get('temperature', 1e-6),
                    max_seq_len=kwargs.get('max_tokens', 8192),
                )) for url in url
        ]

-    def batch_response(self, inputs):
-        batch_num = len(self.model)
+        self.question_type = question_type
+        self.language = language
+        self.with_postprocess = with_postprocess
+        self.post_url = post_url
+        self.post_model_name = post_model_name
+
+    def batch_response(self, models, inputs):
+        batch_num = len(models)
        batch_size = (len(inputs) + batch_num - 1) // batch_num
        result_responses = []

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=batch_num) as executor:
            futures = [
-                executor.submit(self.model[i].generate,
+                executor.submit(models[i].generate,
                                inputs[i * batch_size:(i + 1) * batch_size])
                for i in range(batch_num)
            ]
            for response in executor.map(lambda f: f.result(), futures):
                result_responses.extend(response)
        return result_responses

    def postprocess(self, questions, predictions, question_type='None'):
        self.post_model = [
            MODELS.build(
                dict(
                    type=OpenAISDK,
                    path=self.post_model_name,
                    openai_api_base=url,
                    key='EMPTY',
                    query_per_second=2,
                    meta_template=api_meta_template,
                    temperature=1e-6,
                    max_seq_len=1024,
                )) for url in self.post_url
        ]
        input_prompts = []
        prompt = POST_PROMPT_EN if self.language == 'en' else POST_PROMPT_CN
        for question, response, question_type in zip(questions, predictions,
                                                     question_type):
            input_prompts.append(
                prompt.format(question=question,
                              response=response,
                              question_type=question_type))
        result_responses = self.batch_response(self.post_model, input_prompts)
        return result_responses

    def score(self, predictions, references, origin_prompt, test_set):
        if len(predictions) != len(references):
            return {'error': 'preds and refs have different length'}
        questions = [item[0]['prompt'] for item in origin_prompt]
        count = 0
        correct = 0
        details = []
        results = []

        if self.with_postprocess:
            if self.question_type:
                self.question_type = [self.question_type] * len(questions)
            # test_set is a huggingface Dataset
            elif 'question_type' in test_set.column_names:
                self.question_type = test_set['question_type']
            else:
                self.question_type = ['问答题'] * len(
                    questions) if self.language == 'cn' else [
                        'problem-solving'
                    ] * len(questions)

            predictions = self.postprocess(questions, predictions,
                                           self.question_type)

        inputs = []
        for pred, ref, ques in zip(predictions, references, questions):
            inputs.append(
                EVAL_PROMPT.format(answer=pred, gold_answer=ref,
                                   question=ques))
-        result_responses = self.batch_response(inputs)
+        result_responses = self.batch_response(self.model, inputs)
        results = [
            extract_boxed_answer(result) == 'yes'
            for result in result_responses
@@ -132,13 +220,3 @@ def score(self, predictions, references, origin_prompt):
        }

        return detailed_result
-
-
-if __name__ == '__main__':
-    evaluator = GaoKaoMATHEvaluator('http://0.0.0.0:23333/v1',
-                                    temperature=0.01,
-                                    max_tokens=2048,
-                                    procs=8)
-    predictions = ['1', '2', '3']
-    references = ['1', '2', '3']
-    evaluator.score(predictions, references)
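Taken together, the flow added in this file is: score renders one EVAL_PROMPT per prediction, batch_response splits the prompts into even chunks across the configured OpenAISDK endpoints, and a prediction counts as correct when the judge's verdict is \boxed{yes}. A minimal sketch (the judge reply is made up, and extract_boxed_answer's body is assumed from its truncated definition above):

import re

def extract_boxed_answer(text):
    # Assumed completion of the truncated helper: return the last
    # \boxed{...} payload, or None if the judge emitted no boxed verdict.
    matches = re.findall(r'\\boxed{(.+?)}', text)
    return matches[-1] if matches else None

judge_reply = '分析:考生答案与标准答案一致。\n\\boxed{yes}'  # hypothetical judge output
print(extract_boxed_answer(judge_reply) == 'yes')  # True -> counted as correct

# Sharding arithmetic used by batch_response, e.g. 10 prompts over 3 endpoints:
inputs = [f'prompt-{i}' for i in range(10)]
batch_num = 3
batch_size = (len(inputs) + batch_num - 1) // batch_num  # ceiling division -> 4
shards = [inputs[i * batch_size:(i + 1) * batch_size] for i in range(batch_num)]
print([len(s) for s in shards])  # [4, 4, 2]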
2 changes: 1 addition & 1 deletion opencompass/utils/model_postprocessors.py
@@ -24,7 +24,7 @@ def gen_output_naive(ori_data, extractor):


@TEXT_POSTPROCESSORS.register_module('naive')
-def navie_model_postprocess(preds: list,
+def naive_model_postprocess(preds: list,
                            model_name: str,
                            custom_instruction: str,
                            api_url: Union[str, list],
