[Update] Update MATH dataset with model judge (#1711)
* Update math with llm judge

* Update math with llm judge

* Update math with llm judge

* Update math with llm judge

* Update math with llm judge
liushz authored Nov 25, 2024
1 parent 80e3b9e commit e49fcfd
Showing 5 changed files with 226 additions and 21 deletions.
47 changes: 47 additions & 0 deletions configs/eval_math_llm_judge_internal.py
@@ -0,0 +1,47 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.math.math_0shot_llm_judge_v2_gen_31d777 import math_datasets

    # Select a model of interest
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as qwen2_5_72b_instruct_model

eval_model_name = 'eval_model_name'
postprocessor_model_name = 'postprocessor_model_name'
eval_model_urls = ['http://0.0.0.0:23333/v1']
postprocessor_model_urls = ['http://0.0.0.0:23333/v1']

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])


for dataset in datasets:
    dataset['eval_cfg']['evaluator']['model_name'] = eval_model_name
    dataset['eval_cfg']['evaluator']['url'] = eval_model_urls
    dataset['eval_cfg']['evaluator']['post_url'] = postprocessor_model_urls
    dataset['eval_cfg']['evaluator']['post_model_name'] = postprocessor_model_name


# ------------- Inference Stage ----------------------------------------

from opencompass.runners import LocalRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,
        task=dict(type=OpenICLInferTask)
    ),
)

# ------------- Evaluation Stage ---------------------------------------

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask)
    ),
)
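As a quick usage note: a config like this is typically launched with python run.py configs/eval_math_llm_judge_internal.py, and the loop above should leave every dataset pointing at the deployed judge and postprocessor endpoints. A minimal sanity-check sketch (illustrative only, not part of this commit):

# Sketch: confirm the loop above rewired each dataset's evaluator.
for dataset in datasets:
    evaluator_cfg = dataset['eval_cfg']['evaluator']
    assert evaluator_cfg['model_name'] == eval_model_name
    assert evaluator_cfg['url'] == eval_model_urls
    assert evaluator_cfg['post_url'] == postprocessor_model_urls
    assert evaluator_cfg['post_model_name'] == postprocessor_model_name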
51 changes: 51 additions & 0 deletions configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, GaoKaoMATHEvaluator

# ----------------------------- Model Eval Parameters -----------------------------

naive_model_name = 'dlc_model'  # replace with your model name
naive_model_url = ['http://0.0.0.0:23333/v1']  # multiple API endpoints for acceleration

# ----------------------------- Detailed Config -----------------------------

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048),
)

evaluator = dict(
    type=GaoKaoMATHEvaluator,
    model_name=naive_model_name,
    url=naive_model_url,
    language='en',
    with_postprocess=True,
    post_url=naive_model_url,
    post_model_name=naive_model_name,
)

math_eval_cfg = dict(
    evaluator=evaluator,
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
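To see what the inference prompt looks like concretely: OpenCompass substitutes {problem} in the HUMAN template while the literal braces in \boxed{} survive. A small sketch (str.replace stands in for the framework's own template substitution, and the sample problem is made up):

# Sketch of the rendered inference prompt.
template = ('{problem}\nPlease reason step by step, and put your final '
            'answer within \\boxed{}.')
problem = 'What is $1+1$?'
print(template.replace('{problem}', problem))
# What is $1+1$?
# Please reason step by step, and put your final answer within \boxed{}.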
29 changes: 29 additions & 0 deletions opencompass/datasets/compassbench_obj.py
@@ -63,6 +63,35 @@ def load(path: str, name: str):
        return dataset


@LOAD_DATASET.register_module()
class CompassBenchObjectiveMath(BaseDataset):

    @staticmethod
    def load(path: str):
        with open(path, 'r') as infile:
            data = [json.loads(line) for line in infile]
        for idx in range(len(data)):
            item = data[idx]
            prefix = ''
            if item.get('question_type',
                        None) and item['question_type'] in [
                            'multiple-answer', '多选题'
                        ]:
                if '_en_' in path:
                    prefix = 'This question may have multiple answers, \
please select all correct answers, like this: A, B, C as your final answer\n'

                else:
                    prefix = '这道题可能有多个正确答案,请选择所有正确的答案,\
例如:A, B, C 作为你的最终答案\n'

            if item.get('options', None) and len(item['options']) != 0:
                item['question'] = prefix + item[
                    'question'] + '\n' + get_number(item['options'])
        dataset = Dataset.from_list(data)
        return dataset


@TEXT_POSTPROCESSORS.register_module()
def compassbench_objective_v1_3_postprocess(text: str, name) -> str:
    split = False
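To make the rewrite above concrete, here is a sketch of how one English multiple-answer JSONL item is transformed. get_number is not shown in this diff, so it is stubbed here under the assumption that it letters the options:

import json

def get_number(options):  # hypothetical stub; the real helper lives elsewhere
    return '\n'.join(f'{chr(65 + i)}. {opt}' for i, opt in enumerate(options))

line = json.dumps({'question': 'Which of the following are prime?',
                   'question_type': 'multiple-answer',
                   'options': ['2', '3', '4']})
item = json.loads(line)
prefix = ('This question may have multiple answers, please select all '
          'correct answers, like this: A, B, C as your final answer\n')
item['question'] = prefix + item['question'] + '\n' + get_number(item['options'])
print(item['question'])
# prefix, then the question, then "A. 2 / B. 3 / C. 4" on separate lines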
118 changes: 98 additions & 20 deletions opencompass/datasets/gaokao_math.py
@@ -12,7 +12,6 @@

# from opencompass.utils import get_data_path


EVAL_PROMPT = """
请你作为一个数学高考阅卷专家,判断下面的答案是否与标准答案一致,即考生是否回答正确。下面是一些评判标准:
1. 有些答案可能包含多项内容,可能有单选题,多选题,填空题等,只要答案与标准答案一致即可, 对于多选题和多个空的填空题,需要考生对应的选项或空都回答正确才算正确。
@@ -27,6 +26,42 @@
分析:
""" # noqa E501

POST_PROMPT_CN="""
你是一个乐于助人的助手,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。
我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。
对于单选题,答案应该是选项字母,例如 "A";
对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"];
对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,同一空白的答案可能有多个,请在同一个 string 中用逗号隔开表示,如 ['sqrt(x) 且 x > 10', '1/2, 1/3', '1/4'] 代表问题包含三小问,第一小问包含取值范围信息,第二小问有两个答案,第三小问有一个答案。
对于解答题,类似填空题,答案应该是一个答案列表,每小问的答案间用逗号隔开,同样需要注意某些小问答案多个的情况。
如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。
问题类型: {question_type}
原始问题: {question}
回答: {response}
提取的关键答案:
""" # noqa E501

POST_PROMPT_EN="""
You are a helpful assistant whose task is to extract precise key answers from given response sentences. You must only provide the extracted key answers without any additional text.
I will provide you with a question, a response sentence, and the question type. The response sentence is a reply to the provided question. Using the provided information, you must accurately and precisely identify and extract the expected key answers from the response sentence. Please do not provide subjective opinions about the question.
For multiple-choice questions, the answer should be the letter of the option, such as "A".
For multiple-answer questions, the answer should be a list of option letters, such as ["A"] or ["A", "B", "C"].
For fill-in-the-blank questions, the answer should be a list of answers to fill in the blanks. The number of items in the list should match the number of blanks in the question. If there are multiple answers for the same blank, separate them with a comma within the same string, like ['sqrt(x) and x > 10', '1/2, 1/3', '1/4'], which represents three sub-questions where the first sub-question includes a range, the second sub-question has two answers, and the third sub-question has one answer.
For problem-solving questions, similar to fill-in-the-blank questions, the answer should be a list of answers. Separate answers for different sub-questions with commas, and note that some sub-questions may have multiple answers.
If the response sentence provides multiple different answers, carefully determine whether a later provided answer is a correction or modification of an earlier answer. If so, extract this corrected or modified answer as the final answer. Conversely, if the response sentence fluctuates between multiple answers without a clear final answer, you should output [No valid answer].
Question type: {question_type}
Question: {question}
Output sentences: {response}
Key extracted answer:
""" # noqa E501


def extract_boxed_answer(text):
    match = re.findall(r'\\boxed{(.+?)}', text)
@@ -57,7 +92,15 @@ def load(path: str):
@ICL_EVALUATORS.register_module()
class GaoKaoMATHEvaluator(BaseEvaluator):

-    def __init__(self, model_name, url, **kwargs):
+    def __init__(self,
+                 model_name,
+                 url,
+                 question_type=None,
+                 language='en',
+                 with_postprocess=False,
+                 post_url=[],
+                 post_model_name='',
+                 **kwargs):
        if isinstance(url, str):
            url = [url]

@@ -68,45 +111,90 @@ def __init__(self, model_name, url, **kwargs):
                    path=model_name,
                    openai_api_base=url,
                    key='EMPTY',
-                    query_per_second=1,
+                    query_per_second=2,
                    meta_template=api_meta_template,
-                    temperature=kwargs.get('temperature', 0.01),
+                    temperature=kwargs.get('temperature', 1e-6),
                    max_seq_len=kwargs.get('max_tokens', 8192),
                )) for url in url
        ]

-    def batch_response(self, inputs):
-        batch_num = len(self.model)
+        self.question_type = question_type
+        self.language = language
+        self.with_postprocess = with_postprocess
+        self.post_url = post_url
+        self.post_model_name = post_model_name
+
+    def batch_response(self, models, inputs):
+        batch_num = len(models)
        batch_size = (len(inputs) + batch_num - 1) // batch_num
        result_responses = []

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=batch_num) as executor:
            futures = [
-                executor.submit(self.model[i].generate,
+                executor.submit(models[i].generate,
                                inputs[i * batch_size:(i + 1) * batch_size])
                for i in range(batch_num)
            ]
            for response in executor.map(lambda f: f.result(), futures):
                result_responses.extend(response)
        return result_responses

    def postprocess(self, questions, predictions, question_type='None'):
        self.post_model = [
            MODELS.build(
                dict(
                    type=OpenAISDK,
                    path=self.post_model_name,
                    openai_api_base=url,
                    key='EMPTY',
                    query_per_second=2,
                    meta_template=api_meta_template,
                    temperature=1e-6,
                    max_seq_len=1024,
                )) for url in self.post_url
        ]
        input_prompts = []
        prompt = POST_PROMPT_EN if self.language == 'en' else POST_PROMPT_CN
        for question, response, question_type in zip(questions, predictions,
                                                     question_type):
            input_prompts.append(
                prompt.format(question=question,
                              response=response,
                              question_type=question_type))
        result_responses = self.batch_response(self.post_model, input_prompts)
        return result_responses

    def score(self, predictions, references, origin_prompt, test_set):
        if len(predictions) != len(references):
            return {'error': 'preds and refs have different length'}
        questions = [item[0]['prompt'] for item in origin_prompt]
        count = 0
        correct = 0
        details = []
        results = []

        if self.with_postprocess:
            if self.question_type:
                self.question_type = [self.question_type] * len(questions)
            # test_set is a huggingface Dataset
            elif 'question_type' in test_set.column_names:
                self.question_type = test_set['question_type']
            else:
                self.question_type = ['问答题'] * len(
                    questions) if self.language == 'cn' else [
                        'problem-solving'
                    ] * len(questions)

            predictions = self.postprocess(questions, predictions,
                                           self.question_type)

        inputs = []
        for pred, ref, ques in zip(predictions, references, questions):
            inputs.append(
                EVAL_PROMPT.format(answer=pred, gold_answer=ref,
                                   question=ques))
-        result_responses = self.batch_response(inputs)
+        result_responses = self.batch_response(self.model, inputs)
        results = [
            extract_boxed_answer(result) == 'yes'
            for result in result_responses
@@ -132,13 +220,3 @@ def score(self, predictions, references, origin_prompt):
        }

        return detailed_result
-
-
-if __name__ == '__main__':
-    evaluator = GaoKaoMATHEvaluator('http://0.0.0.0:23333/v1',
-                                    temperature=0.01,
-                                    max_tokens=2048,
-                                    procs=8)
-    predictions = ['1', '2', '3']
-    references = ['1', '2', '3']
-    evaluator.score(predictions, references)
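Taken together, the flow added in this file is: score renders one EVAL_PROMPT per prediction, batch_response splits the prompts into even chunks across the configured OpenAISDK endpoints, and a prediction counts as correct when the judge's verdict is \boxed{yes}. A minimal sketch (the judge reply is made up, and extract_boxed_answer's body is assumed from its truncated definition above):

import re

def extract_boxed_answer(text):
    # Assumed completion of the truncated helper: return the last
    # \boxed{...} payload, or None if the judge emitted no boxed verdict.
    matches = re.findall(r'\\boxed{(.+?)}', text)
    return matches[-1] if matches else None

judge_reply = '分析:考生答案与标准答案一致。\n\\boxed{yes}'  # hypothetical judge output
print(extract_boxed_answer(judge_reply) == 'yes')  # True -> counted as correct

# Sharding arithmetic used by batch_response, e.g. 10 prompts over 3 endpoints:
inputs = [f'prompt-{i}' for i in range(10)]
batch_num = 3
batch_size = (len(inputs) + batch_num - 1) // batch_num  # ceiling division -> 4
shards = [inputs[i * batch_size:(i + 1) * batch_size] for i in range(batch_num)]
print([len(s) for s in shards])  # [4, 4, 2]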
2 changes: 1 addition & 1 deletion opencompass/utils/model_postprocessors.py
@@ -24,7 +24,7 @@ def gen_output_naive(ori_data, extractor):


@TEXT_POSTPROCESSORS.register_module('naive')
-def navie_model_postprocess(preds: list,
+def naive_model_postprocess(preds: list,
                            model_name: str,
                            custom_instruction: str,
                            api_url: Union[str, list],
