update llm benchmark format
Signed-off-by: IcyFeather <[email protected]>
IcyFeather233 committed Sep 19, 2024
1 parent ef5841b commit cf82d95
Showing 7 changed files with 58 additions and 39 deletions.
6 changes: 3 additions & 3 deletions core/testenvmanager/dataset/dataset.py
@@ -19,7 +19,7 @@
import json

import pandas as pd
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse, JsonlDataParse, JSONDataInfoParse
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse, JsonlDataParse, JSONMetaDataParse

from core.common import utils
from core.common.constant import DatasetFormat
@@ -436,8 +436,8 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature
e.g.: TxtDataParse, CSVDataParse.
"""
if file.split('/')[-1] == "data_info.json":
data = JSONDataInfoParse(data_type=data_type, func=feature_process)
if file.split('/')[-1] == "metadata.json":
data = JSONMetaDataParse(data_type=data_type, func=feature_process)
data.parse(file)
return data

58 changes: 39 additions & 19 deletions examples/government/singletask_learning_bench/README.md
@@ -10,19 +10,43 @@ This Benchmark consists of two parts: subjective evaluation data and objective evaluation data.

## Design

Dataset format:
### Metadata Format

|name|optionality|information|
| Name | Field Name | Option | Description |
| --- | --- | --- | --- |
| Data Name | dataset | Required | Name of the dataset |
| Data Description | description | Optional | Dataset description, such as usage scope, sample size, etc. |
| First-level Dimension | level_1_dim | Required | Should fill in "Single Modal" or "Multi-Modal" |
| Second-level Dimension | level_2_dim | Required | For "Single Modal", fill in "Text", "Image", or "Audio". For "Multi-Modal", fill in "Text-Image", "Text-Audio", "Image-Audio", or "Text-Image-Audio" |
| Third-level Dimension | level_3_dim | Optional | Should be filled if all samples in the dataset have the same third-level dimension. If filled, content should be based on the standards shown in the normative reference document |
| Fourth-level Dimension | level_4_dim | Optional | Should be filled if all samples in the dataset have the same fourth-level dimension. If filled, content should be based on the standards shown in the normative reference document |

metadata example:

```json
{
"dataset": "Medical BenchMark",
"description": "xxx",
"level_1_dim": "single-modal",
"level_2_dim": "text",
"level_3_dim": "Q&A",
"level_4_dim": "medical"
}
```
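
For illustration, a minimal loader sketch for such a `metadata.json` (the helper name and path are assumptions; only the required fields come from the table above):

```python
import json

# Required fields per the metadata table above; this loader is a sketch,
# not part of the benchmark code.
REQUIRED_FIELDS = {"dataset", "level_1_dim", "level_2_dim"}

def load_metadata(path):
    """Read a metadata.json file and check that the required fields are present."""
    with open(path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    missing = REQUIRED_FIELDS - set(meta)
    if missing:
        raise ValueError(f"metadata.json is missing required fields: {sorted(missing)}")
    return meta

# Example call (path assumed from the dataset layout described later):
# meta = load_metadata("dataset/government/objective/test_data/metadata.json")
```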

### Data Format

|name|Option|information|
|---|---|---|
|prompt|optional|the background of the LLM testing|
|question|required|the testing question|
|response|required|the answer of the question|
|explanation|optional|the explanation of the answer|
|judge_prompt|optional|the prompt of the judge model|
|level_1_dim|optional|single-modal or multi-modal|
|level_2_dim|optional|single-modal: text, image, video; multi-modal: text-image, text-video, text-image-video|
|level_3_dim|required|details|
|level_4_dim|required|details|
|prompt|Optional|the background of the LLM testing|
|question|Required|the testing question|
|response|Required|the answer of the question|
|explanation|Optional|the explanation of the answer|
|judge_prompt|Optional|the prompt of the judge model|
|level_1_dim|Optional|single-modal or multi-modal|
|level_2_dim|Optional|single-modal: text, image, video; multi-modal: text-image, text-video, text-image-video|
|level_3_dim|Required|details|
|level_4_dim|Required|details|

data example:

@@ -32,7 +56,7 @@ data example:
"question": "Which one is the correct answer of xxx? A. xxx B. xxx C. xxx D. xxx",
"response": "C",
"explanation": "xxx",
"level_1_dim": "singel-modal",
"level_1_dim": "single-modal",
"level_2_dim": "text",
"level_3_dim": "knowledge Q&A",
"level_4_dim": "medical knowledge"
Expand All @@ -52,24 +76,20 @@ You can download dataset in [kaggle](https://www.kaggle.com/datasets/hsj576/gove
dataset/government
├── objective
│ ├── test_data
│ │ ├── data_info.json
│ │ ├── data.jsonl
│ │ └── prompts.json
│ │ └── metadata.json
│ └── train_data
└── subjective
├── test_data
│ ├── data_full.jsonl
│ ├── data_info.json
│ ├── data.jsonl
│ └── prompts.json
│ └── metadata.json
└── train_data
```
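
For illustration, a small sketch of iterating over one of the `data.jsonl` files above (the helper name is an assumption; each line is a JSON object with the fields listed in the data format table):

```python
import json

def read_jsonl(path):
    """Yield one dict per line of a JSONL benchmark file."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

# Example usage (path assumed from the layout above):
# for sample in read_jsonl("dataset/government/objective/test_data/data.jsonl"):
#     print(sample["question"], "->", sample["response"])
```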

## Prepare Environment

You need to install the changed-sedna package, which added `JSONDataInfoParse` in `sedna.datasources`

Replace the file in `yourpath/anaconda3/envs/ianvs/lib/python3.x/site-packages/sedna` with `examples/resources/sedna-jsondatainfo.zip`
You should modify your sedna package as shown in this commit: [my sedna repo commit](https://github.com/IcyFeather233/sedna/commit/e13b82363c03dc771fca4922a24798554ca32a9f)

## Run Ianvs

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, LOGGER.info_function
from __future__ import absolute_import, division

import os
import tempfile
@@ -59,17 +59,16 @@ def save(self, model_path):

def predict(self, data, input_shape=None, **kwargs):
LOGGER.info("BaseModel predict")

if 'infer_system_prompt' in data.prompts:
infer_system_prompt = data.prompts['infer_system_prompt']
LOGGER.info(f"Dataset: {data.dataset_name}")
LOGGER.info(f"Description: {data.description}")
LOGGER.info(f"Data Level 1 Dim: {data.level_1_dim}")
LOGGER.info(f"Data Level 2 Dim: {data.level_2_dim}")

answer_list = []
for line in tqdm(data.x, desc="Processing", unit="question"):
# 3-shot
indices = random.sample([i for i, l in enumerate(data.x) if l != line], 3)
history = []
if infer_system_prompt:
history.append({"role": "system", "content": infer_system_prompt})
for idx in indices:
history.append({"role": "user", "content": data.x[idx]})
history.append({"role": "assistant", "content": data.y[idx]})
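
For clarity, a self-contained sketch of the 3-shot history construction used in the new `predict` above (dummy data; the OpenAI-style `role`/`content` message format mirrors the diff):

```python
import random

# Dummy stand-ins for data.x / data.y from the diff above.
questions = ["Q1 ...", "Q2 ...", "Q3 ...", "Q4 ...", "Q5 ..."]
answers = ["A", "B", "C", "D", "A"]

def build_three_shot_history(current_question):
    """Pick 3 other samples as in-context examples, then append the real question."""
    indices = random.sample(
        [i for i, q in enumerate(questions) if q != current_question], 3
    )
    history = []
    for idx in indices:
        history.append({"role": "user", "content": questions[idx]})
        history.append({"role": "assistant", "content": answers[idx]})
    history.append({"role": "user", "content": current_question})
    return history

# history = build_three_shot_history(questions[0])
```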
@@ -4,7 +4,7 @@ testenv:
# the url address of train dataset index; string type;
train_data: "/home/icyfeather/Projects/ianvs/dataset/government/objective/train_data/data.jsonl"
# the url address of test dataset index; string type;
test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/objective/test_data/data_info.json"
test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/objective/test_data/metadata.json"

# metrics configuration for test case's evaluation; list type;
metrics:
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, LOGGER.info_function
from __future__ import absolute_import, division

import os
import tempfile
@@ -59,25 +59,23 @@ def save(self, model_path):

def predict(self, data, input_shape=None, **kwargs):
LOGGER.info("BaseModel predict")

if 'infer_system_prompt' in data.prompts:
infer_system_prompt = data.prompts['infer_system_prompt']
LOGGER.info(f"Dataset: {data.dataset_name}")
LOGGER.info(f"Description: {data.description}")
LOGGER.info(f"Data Level 1 Dim: {data.level_1_dim}")
LOGGER.info(f"Data Level 2 Dim: {data.level_2_dim}")

answer_list = []
for line in tqdm(data.x, desc="Processing", unit="question"):
history = []
query = line.split('||')[0]
if infer_system_prompt:
history.append({"role": "system", "content": infer_system_prompt})
history.append({"role": "user", "content": query})
history.append({"role": "user", "content": line})
response = self._infer(history)
answer_list.append(response)

judgement_list = []

# evaluate by llm
for index in tqdm(range(len(answer_list)), desc="Evaluating", ascii=False, ncols=75):
prompt = data.prompts['eval_user_template'].replace('{question}', data.x[index].split('||')[0]).replace('{reference}', data.x[index].split('||')[1]).replace('{answer}', answer_list[index])
prompt = data.judge_prompts[index] + answer_list[index]
judgement = self._openai_generate(prompt)
judgement_list.append(judgement)

@@ -113,6 +111,8 @@ def _infer(self, messages):

def _openai_generate(self, user_question, system=None):
key = os.getenv("DEEPSEEK_API_KEY")
if not key:
raise ValueError("You should set DEEPSEEK_API_KEY in your env.")
client = OpenAI(api_key=key, base_url="https://api.deepseek.com")

messages = []
@@ -4,7 +4,7 @@ testenv:
# the url address of train dataset index; string type;
train_data: "/home/icyfeather/Projects/ianvs/dataset/government/subjective/train_data/data.jsonl"
# the url address of test dataset index; string type;
test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/subjective/test_data/data_info.json"
test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/subjective/test_data/metadata.json"

# metrics configuration for test case's evaluation; list type;
metrics:
Binary file removed examples/resources/sedna-jsondatainfo.zip
