update llm benchmark format
Signed-off-by: IcyFeather <[email protected]>
IcyFeather233 committed Sep 19, 2024
1 parent ef5841b commit cf82d95
Showing 7 changed files with 58 additions and 39 deletions.
6 changes: 3 additions & 3 deletions core/testenvmanager/dataset/dataset.py
@@ -19,7 +19,7 @@
import json

import pandas as pd
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse, JsonlDataParse, JSONDataInfoParse
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse, JsonlDataParse, JSONMetaDataParse

from core.common import utils
from core.common.constant import DatasetFormat
@@ -436,8 +436,8 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature
e.g.: TxtDataParse, CSVDataParse.
"""
if file.split('/')[-1] == "data_info.json":
data = JSONDataInfoParse(data_type=data_type, func=feature_process)
if file.split('/')[-1] == "metadata.json":
data = JSONMetaDataParse(data_type=data_type, func=feature_process)
data.parse(file)
return data

58 changes: 39 additions & 19 deletions examples/government/singletask_learning_bench/README.md
@@ -10,19 +10,43 @@ This Benchmark consists of two parts: subjective evaluation data and objective evaluation data.

## Design

Dataset format:
### Metadata Format

|name|optionality|information|
| Name | Field Name | Option | Description |
| --- | --- | --- | --- |
| Data Name | dataset | Required | Name of the dataset |
| Data Description | description | Optional | Dataset description, such as usage scope, sample size, etc. |
| First-level Dimension | level_1_dim | Required | Should fill in "Single Modal" or "Multi-Modal" |
| Second-level Dimension | level_2_dim | Required | For "Single Modal", fill in "Text", "Image", or "Audio". For "Multi-Modal", fill in "Text-Image", "Text-Audio", "Image-Audio", or "Text-Image-Audio" |
| Third-level Dimension | level_3_dim | Optional | Should be filled if all samples in the dataset have the same third-level dimension. If filled, content should be based on the standards shown in the normative reference document |
| Fourth-level Dimension | level_4_dim | Optional | Should be filled if all samples in the dataset have the same fourth-level dimension. If filled, content should be based on the standards shown in the normative reference document |

metadata example:

```json
{
"dataset": "Medical BenchMark",
"description": "xxx",
"level_1_dim": "single-modal",
"level_2_dim": "text",
"level_3_dim": "Q&A",
"level_4_dim": "medical"
}
```
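
For illustration, a minimal loader sketch for such a `metadata.json` (the helper name and path are assumptions; only the required fields come from the table above):

```python
import json

# Required fields per the metadata table above; this loader is a sketch,
# not part of the benchmark code.
REQUIRED_FIELDS = {"dataset", "level_1_dim", "level_2_dim"}

def load_metadata(path):
    """Read a metadata.json file and check that the required fields are present."""
    with open(path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    missing = REQUIRED_FIELDS - set(meta)
    if missing:
        raise ValueError(f"metadata.json is missing required fields: {sorted(missing)}")
    return meta

# Example call (path assumed from the dataset layout described later):
# meta = load_metadata("dataset/government/objective/test_data/metadata.json")
```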

### Data Format

|name|Option|information|
|---|---|---|
|prompt|optional|the background of the LLM testing|
|question|required|the testing question|
|response|required|the answer of the question|
|explanation|optional|the explanation of the answer|
|judge_prompt|optional|the prompt of the judge model|
|level_1_dim|optional|single-modal or multi-modal|
|level_2_dim|optional|single-modal: text, image, video; multi-modal: text-image, text-video, text-image-video|
|level_3_dim|required|details|
|level_4_dim|required|details|
|prompt|Optional|the background of the LLM testing|
|question|Required|the testing question|
|response|Required|the answer of the question|
|explanation|Optional|the explanation of the answer|
|judge_prompt|Optional|the prompt of the judge model|
|level_1_dim|Optional|single-modal or multi-modal|
|level_2_dim|Optional|single-modal: text, image, video; multi-modal: text-image, text-video, text-image-video|
|level_3_dim|Required|details|
|level_4_dim|Required|details|

data example:

@@ -32,7 +56,7 @@ data example:
"question": "Which one is the correct answer of xxx? A. xxx B. xxx C. xxx D. xxx",
"response": "C",
"explanation": "xxx",
"level_1_dim": "singel-modal",
"level_1_dim": "single-modal",
"level_2_dim": "text",
"level_3_dim": "knowledge Q&A",
"level_4_dim": "medical knowledge"
Expand All @@ -52,24 +76,20 @@ You can download dataset in [kaggle](https://www.kaggle.com/datasets/hsj576/gove
dataset/government
├── objective
│ ├── test_data
│ │ ├── data_info.json
│ │ ├── data.jsonl
│ │ └── prompts.json
│ │ └── metadata.json
│ └── train_data
└── subjective
├── test_data
│ ├── data_full.jsonl
│ ├── data_info.json
│ ├── data.jsonl
│ └── prompts.json
│ └── metadata.json
└── train_data
```
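
For illustration, a small sketch of iterating over one of the `data.jsonl` files above (the helper name is an assumption; each line is a JSON object with the fields listed in the data format table):

```python
import json

def read_jsonl(path):
    """Yield one dict per line of a JSONL benchmark file."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

# Example usage (path assumed from the layout above):
# for sample in read_jsonl("dataset/government/objective/test_data/data.jsonl"):
#     print(sample["question"], "->", sample["response"])
```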

## Prepare Environment

You need to install the changed-sedna package, which added `JSONDataInfoParse` in `sedna.datasources`

Replace the file in `yourpath/anaconda3/envs/ianvs/lib/python3.x/site-packages/sedna` with `examples/resources/sedna-jsondatainfo.zip`
You should modify your sedna package as shown in this commit: [my sedna repo commit](https://github.com/IcyFeather233/sedna/commit/e13b82363c03dc771fca4922a24798554ca32a9f)

## Run Ianvs

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, LOGGER.info_function
from __future__ import absolute_import, division

import os
import tempfile
@@ -59,17 +59,16 @@ def save(self, model_path):

def predict(self, data, input_shape=None, **kwargs):
LOGGER.info("BaseModel predict")

if 'infer_system_prompt' in data.prompts:
infer_system_prompt = data.prompts['infer_system_prompt']
LOGGER.info(f"Dataset: {data.dataset_name}")
LOGGER.info(f"Description: {data.description}")
LOGGER.info(f"Data Level 1 Dim: {data.level_1_dim}")
LOGGER.info(f"Data Level 2 Dim: {data.level_2_dim}")

answer_list = []
for line in tqdm(data.x, desc="Processing", unit="question"):
# 3-shot
indices = random.sample([i for i, l in enumerate(data.x) if l != line], 3)
history = []
if infer_system_prompt:
history.append({"role": "system", "content": infer_system_prompt})
for idx in indices:
history.append({"role": "user", "content": data.x[idx]})
history.append({"role": "assistant", "content": data.y[idx]})
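
For clarity, a self-contained sketch of the 3-shot history construction used in the new `predict` above (dummy data; the OpenAI-style `role`/`content` message format mirrors the diff):

```python
import random

# Dummy stand-ins for data.x / data.y from the diff above.
questions = ["Q1 ...", "Q2 ...", "Q3 ...", "Q4 ...", "Q5 ..."]
answers = ["A", "B", "C", "D", "A"]

def build_three_shot_history(current_question):
    """Pick 3 other samples as in-context examples, then append the real question."""
    indices = random.sample(
        [i for i, q in enumerate(questions) if q != current_question], 3
    )
    history = []
    for idx in indices:
        history.append({"role": "user", "content": questions[idx]})
        history.append({"role": "assistant", "content": answers[idx]})
    history.append({"role": "user", "content": current_question})
    return history

# history = build_three_shot_history(questions[0])
```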
@@ -4,7 +4,7 @@ testenv:
# the url address of train dataset index; string type;
train_data: "/home/icyfeather/Projects/ianvs/dataset/government/objective/train_data/data.jsonl"
# the url address of test dataset index; string type;
test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/objective/test_data/data_info.json"
test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/objective/test_data/metadata.json"

# metrics configuration for test case's evaluation; list type;
metrics:
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, LOGGER.info_function
from __future__ import absolute_import, division

import os
import tempfile
@@ -59,25 +59,23 @@ def save(self, model_path):

def predict(self, data, input_shape=None, **kwargs):
LOGGER.info("BaseModel predict")

if 'infer_system_prompt' in data.prompts:
infer_system_prompt = data.prompts['infer_system_prompt']
LOGGER.info(f"Dataset: {data.dataset_name}")
LOGGER.info(f"Description: {data.description}")
LOGGER.info(f"Data Level 1 Dim: {data.level_1_dim}")
LOGGER.info(f"Data Level 2 Dim: {data.level_2_dim}")

answer_list = []
for line in tqdm(data.x, desc="Processing", unit="question"):
history = []
query = line.split('||')[0]
if infer_system_prompt:
history.append({"role": "system", "content": infer_system_prompt})
history.append({"role": "user", "content": query})
history.append({"role": "user", "content": line})
response = self._infer(history)
answer_list.append(response)

judgement_list = []

# evaluate by llm
for index in tqdm(range(len(answer_list)), desc="Evaluating", ascii=False, ncols=75):
prompt = data.prompts['eval_user_template'].replace('{question}', data.x[index].split('||')[0]).replace('{reference}', data.x[index].split('||')[1]).replace('{answer}', answer_list[index])
prompt = data.judge_prompts[index] + answer_list[index]
judgement = self._openai_generate(prompt)
judgement_list.append(judgement)

@@ -113,6 +111,8 @@ def _infer(self, messages):

def _openai_generate(self, user_question, system=None):
key = os.getenv("DEEPSEEK_API_KEY")
if not key:
raise ValueError("You should set DEEPSEEK_API_KEY in your env.")
client = OpenAI(api_key=key, base_url="https://api.deepseek.com")

messages = []
@@ -4,7 +4,7 @@ testenv:
# the url address of train dataset index; string type;
train_data: "/home/icyfeather/Projects/ianvs/dataset/government/subjective/train_data/data.jsonl"
# the url address of test dataset index; string type;
test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/subjective/test_data/data_info.json"
test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/subjective/test_data/metadata.json"

# metrics configuration for test case's evaluation; list type;
metrics:
Binary file removed examples/resources/sedna-jsondatainfo.zip
