Skip to content

Commit

Permalink
add llm-benchmarks proposal
Browse files Browse the repository at this point in the history
Signed-off-by: IcyFeather <[email protected]>

add opencompass and llm singletask learning bench

Signed-off-by: IcyFeather <[email protected]>

update llm single task learning bench readme

Signed-off-by: IcyFeather <[email protected]>

add government benchmark

Signed-off-by: IcyFeather <[email protected]>

update government benchmark

Signed-off-by: IcyFeather <[email protected]>

update llm government benchmark implementation

Signed-off-by: IcyFeather <[email protected]>

update llm government benchmark implementation

Signed-off-by: IcyFeather <[email protected]>

update government README

Signed-off-by: IcyFeather <[email protected]>

update llm benchmark format

Signed-off-by: IcyFeather <[email protected]>

update government benchmark

Signed-off-by: IcyFeather <[email protected]>

update government benchmark dataset

Signed-off-by: IcyFeather <[email protected]>

add llm-benchmarks proposal

Signed-off-by: IcyFeather <[email protected]>

update llm benchmark proposal

Signed-off-by: IcyFeather <[email protected]>

update llm benchmark proposal

Signed-off-by: IcyFeather <[email protected]>

update llm benchmark proposal

Signed-off-by: IcyFeather <[email protected]>

translate llm-benchmark proposal

Signed-off-by: IcyFeather <[email protected]>

update proposal, add opencompass tutorial

Signed-off-by: IcyFeather <[email protected]>

update government benchmark sedna package

Signed-off-by: IcyFeather <[email protected]>

update government benchmark

Signed-off-by: IcyFeather <[email protected]>

update llm benchmark format

Signed-off-by: IcyFeather <[email protected]>

update llm benchmark format

Signed-off-by: IcyFeather <[email protected]>

fix pylint check problem

Signed-off-by: IcyFeather <[email protected]>

fix pylint check problem

Signed-off-by: IcyFeather <[email protected]>

fix pylint check problem

Signed-off-by: IcyFeather <[email protected]>

translate Chinese comments to English

Signed-off-by: IcyFeather <[email protected]>

add government llm benchmark

Signed-off-by: IcyFeather <[email protected]>
  • Loading branch information
IcyFeather233 authored and IcyFeather committed Oct 28, 2024
1 parent 5c48872 commit 0abcb9a
Show file tree
Hide file tree
Showing 32 changed files with 2,114 additions and 9 deletions.
1 change: 1 addition & 0 deletions core/common/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class DatasetFormat(Enum):
CSV = "csv"
TXT = "txt"
JSON = "json"
JSONL = "jsonl"


class ParadigmType(Enum):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,5 +84,8 @@ def _inference(self, job, trained_model):
inference_output_dir = os.path.join(self.workspace, "output/inference/")
os.environ["RESULT_SAVED_URL"] = inference_output_dir
job.load(trained_model)
infer_res = job.predict(inference_dataset.x)
if hasattr(inference_dataset, 'need_other_info'):
infer_res = job.predict(inference_dataset)
else:
infer_res = job.predict(inference_dataset.x)
return infer_res
77 changes: 69 additions & 8 deletions core/testenvmanager/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,16 @@

import os
import tempfile

import pandas as pd
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse

# pylint: disable=no-name-in-module
# pylint: disable=too-many-instance-attributes
from sedna.datasources import (
CSVDataParse,
TxtDataParse,
JSONDataParse,
JsonlDataParse,
JSONMetaDataParse,
)
from core.common import utils
from core.common.constant import DatasetFormat

Expand All @@ -38,12 +44,28 @@ class Dataset:
def __init__(self, config):
    """
    Build a Dataset from a testenv config dict.

    Every known field starts out as an empty string and is then
    overwritten from *config* by ``_parse_config``, which also
    validates the resulting values.
    """
    # Initialize all recognized fields to "" so _parse_config can
    # distinguish "configured" from "absent" via truthiness.
    for field in (
        "train_url", "test_url",
        "train_index", "test_index",
        "train_data", "test_data",
        "train_data_info", "test_data_info",
        "label",
    ):
        setattr(self, field, "")
    self._parse_config(config)

def _check_fields(self):
    """
    Validate every configured dataset URL.

    ``train_url``/``test_url`` are always checked; the optional
    sources (index / data / data_info, per split) are checked only
    when they were set in the config.
    """
    self._check_dataset_url(self.train_url)
    self._check_dataset_url(self.test_url)
    optional_urls = (
        self.train_index, self.test_index,
        self.train_data, self.test_data,
        self.train_data_info, self.test_data_info,
    )
    for url in optional_urls:
        if url:
            self._check_dataset_url(url)

def _parse_config(self, config):
for attr, value in config.items():
Expand Down Expand Up @@ -103,6 +125,20 @@ def _process_index_file(self, file_url):

return None

def _process_data_file(self, file_url):
    """
    Return *file_url* if it is a directly usable data file.

    Only the jsonl format is accepted here; any other format
    yields None so the caller can fall back to other sources.
    """
    is_jsonl = utils.get_file_format(file_url) == DatasetFormat.JSONL.value
    return file_url if is_jsonl else None

def _process_data_info_file(self, file_url):
    """
    Return *file_url* if it is a data-info (metadata) file.

    Only the json format is accepted here; any other format
    yields None so the caller can fall back to other sources.
    """
    if utils.get_file_format(file_url) != DatasetFormat.JSON.value:
        return None
    return file_url

def process_dataset(self):
    """
    Process the train and test datasets for a testcase.

    Exactly one source must be configured per split:
    - ``*_index``: an index file whose relative data paths are converted
      to absolute paths (e.g. a txt index file);
    - ``*_data``: a data file used directly (currently jsonl);
    - ``*_data_info``: a json metadata/info file describing the data.

    Raises:
        NotImplementedError: if none of index/data/data_info is set
            for the train split or for the test split.
    """
    if self.train_index:
        self.train_url = self._process_index_file(self.train_index)
    elif self.train_data:
        self.train_url = self._process_data_file(self.train_data)
    elif self.train_data_info:
        self.train_url = self._process_data_info_file(self.train_data_info)
    else:
        raise NotImplementedError(
            'not one of train_index/train_data/train_data_info')

    if self.test_index:
        self.test_url = self._process_index_file(self.test_index)
    elif self.test_data:
        self.test_url = self._process_data_file(self.test_data)
    elif self.test_data_info:
        self.test_url = self._process_data_info_file(self.test_data_info)
    else:
        raise NotImplementedError(
            'not one of test_index/test_data/test_data_info')

    # NOTE: the old unconditional re-processing
    #     self.train_url = self._process_index_file(self.train_url)
    #     self.test_url = self._process_index_file(self.test_url)
    # was removed: the URLs are already resolved by the branches above,
    # and _process_index_file returns None for non-index formats
    # (e.g. jsonl/json), which would clobber the resolved URL.

# pylint: disable=too-many-arguments
def split_dataset(self, dataset_url, dataset_format, ratio, method="default",
Expand Down Expand Up @@ -388,6 +441,11 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature
e.g.: TxtDataParse, CSVDataParse.
"""
if file.split('/')[-1] == "metadata.json":
data = JSONMetaDataParse(data_type=data_type, func=feature_process)
data.parse(file)
return data

data_format = utils.get_file_format(file)

data = None
Expand All @@ -397,11 +455,14 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature

if data_format == DatasetFormat.TXT.value:
data = TxtDataParse(data_type=data_type, func=feature_process)
#print(file)
data.parse(file, use_raw=use_raw)

if data_format == DatasetFormat.JSON.value:
data = JSONDataParse(data_type=data_type, func=feature_process)
data.parse(file)

if data_format == DatasetFormat.JSONL.value:
data = JsonlDataParse(data_type=data_type, func=feature_process)
data.parse(file)

return data
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 0abcb9a

Please sign in to comment.