updated smart_coding large model benchmark
Signed-off-by: boX <[email protected]>
safe-b committed Sep 29, 2024
1 parent b59e3ab commit 6f64867
Showing 31 changed files with 14,133 additions and 9 deletions.
1 change: 1 addition & 0 deletions core/common/constant.py
@@ -25,6 +25,7 @@ class DatasetFormat(Enum):
    CSV = "csv"
    TXT = "txt"
    JSON = "json"
    JSONL = "jsonl"


class ParadigmType(Enum):
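With JSONL registered in DatasetFormat, the benchmark can recognise .jsonl data files alongside CSV, TXT and JSON. Below is a minimal sketch of how a file suffix might be resolved against the enum; the real lookup lives in core.common.utils.get_file_format, which is not part of this diff, so treat guess_file_format as an illustrative assumption:

from enum import Enum


class DatasetFormat(Enum):
    CSV = "csv"
    TXT = "txt"
    JSON = "json"
    JSONL = "jsonl"


def guess_file_format(file_url: str) -> str:
    # Compare the file suffix against the enum values; raise on unknown formats.
    suffix = file_url.rsplit(".", 1)[-1].lower()
    for fmt in DatasetFormat:
        if suffix == fmt.value:
            return fmt.value
    raise ValueError(f"unsupported dataset format: {suffix}")


print(guess_file_format("train_data.jsonl"))  # prints "jsonl"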
@@ -84,5 +84,8 @@ def _inference(self, job, trained_model):
        inference_output_dir = os.path.join(self.workspace, "output/inference/")
        os.environ["RESULT_SAVED_URL"] = inference_output_dir
        job.load(trained_model)
        infer_res = job.predict(inference_dataset.x)
        if hasattr(inference_dataset, 'need_other_info'):
            infer_res = job.predict(inference_dataset)
        else:
            infer_res = job.predict(inference_dataset.x)
        return infer_res
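The change above hands the whole dataset object to predict() whenever it exposes a need_other_info attribute, instead of only the raw features in .x, presumably so that a large-model benchmark can pass prompts or other metadata along. A stand-alone sketch of that dispatch; SimpleDataset, RichDataset, EchoEstimator and run_inference are hypothetical names used only to illustrate the branch:

class SimpleDataset:
    def __init__(self, x):
        self.x = x


class RichDataset(SimpleDataset):
    need_other_info = True  # class attribute; the hasattr() check in the benchmark sees it

    def __init__(self, x, prompts):
        super().__init__(x)
        self.prompts = prompts  # extra information beyond the raw features


class EchoEstimator:
    def predict(self, data):
        # Receives either the raw feature list or the full RichDataset instance.
        return data


def run_inference(job, inference_dataset):
    # Mirrors the branch added in this commit.
    if hasattr(inference_dataset, "need_other_info"):
        return job.predict(inference_dataset)
    return job.predict(inference_dataset.x)


print(run_inference(EchoEstimator(), SimpleDataset([1, 2, 3])))      # -> [1, 2, 3]
print(run_inference(EchoEstimator(), RichDataset([1], ["prompt"])))  # -> the RichDataset object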
68 changes: 62 additions & 6 deletions core/testenvmanager/dataset/dataset.py
@@ -16,9 +16,10 @@

import os
import tempfile
import json

import pandas as pd
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse, JsonlDataParse, JSONMetaDataParse

from core.common import utils
from core.common.constant import DatasetFormat
@@ -38,12 +39,28 @@ class Dataset:
    def __init__(self, config):
        self.train_url: str = ""
        self.test_url: str = ""
        self.train_index: str = ""
        self.test_index: str = ""
        self.train_data: str = ""
        self.test_data: str = ""
        self.train_data_info: str = ""
        self.test_data_info: str = ""
        self.label: str = ""
        self._parse_config(config)

    def _check_fields(self):
        self._check_dataset_url(self.train_url)
        self._check_dataset_url(self.test_url)
        if self.train_index:
            self._check_dataset_url(self.train_index)
        if self.test_index:
            self._check_dataset_url(self.test_index)
        if self.train_data:
            self._check_dataset_url(self.train_data)
        if self.test_data:
            self._check_dataset_url(self.test_data)
        if self.train_data_info:
            self._check_dataset_url(self.train_data_info)
        if self.test_data_info:
            self._check_dataset_url(self.test_data_info)

    def _parse_config(self, config):
        for attr, value in config.items():
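The new attributes give a test environment three ways to point at data: an index file, a raw data file, or a data-info file, for both training and testing. A hypothetical configuration dict showing the extra fields; the key names are assumed to mirror the attributes initialised above, the paths are placeholders, and real values would come from the benchmark's test-environment configuration:

config = {
    "train_data": "/home/data/train_data.jsonl",    # raw JSONL training set
    "test_data_info": "/home/data/metadata.json",   # info/metadata file describing the test set
    "label": "answer",
}

# dataset = Dataset(config)  # _parse_config copies these keys onto the instance,
#                            # and _check_fields would then validate the referenced paths.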
@@ -103,6 +120,20 @@ def _process_index_file(self, file_url):

        return None

    def _process_data_file(self, file_url):
        file_format = utils.get_file_format(file_url)
        if file_format == DatasetFormat.JSONL.value:
            return file_url

        return None

    def _process_data_info_file(self, file_url):
        file_format = utils.get_file_format(file_url)
        if file_format == DatasetFormat.JSON.value:
            return file_url

        return None

    def process_dataset(self):
        """
        process dataset:
@@ -111,9 +142,26 @@ def process_dataset(self):
        in the index file(e.g.: txt index file).
        """
        if self.train_index:
            self.train_url = self._process_index_file(self.train_index)
        elif self.train_data:
            self.train_url = self._process_data_file(self.train_data)
        elif self.train_data_info:
            self.train_url = self._process_data_info_file(self.train_data_info)
            # raise NotImplementedError('to be done')
        else:
            raise NotImplementedError('not one of train_index/train_data/train_data_info')

        if self.test_index:
            self.test_url = self._process_index_file(self.test_index)
        elif self.test_data:
            self.test_url = self._process_data_file(self.test_data)
        elif self.test_data_info:
            self.test_url = self._process_data_info_file(self.test_data_info)
            # raise NotImplementedError('to be done')
        else:
            raise NotImplementedError('not one of test_index/test_data/test_data_info')

        self.train_url = self._process_index_file(self.train_url)
        self.test_url = self._process_index_file(self.test_url)

    # pylint: disable=too-many-arguments
    def split_dataset(self, dataset_url, dataset_format, ratio, method="default",
@@ -388,6 +436,11 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature
        e.g.: TxtDataParse, CSVDataParse.
        """
        if file.split('/')[-1] == "metadata.json":
            data = JSONMetaDataParse(data_type=data_type, func=feature_process)
            data.parse(file)
            return data

        data_format = utils.get_file_format(file)

        data = None
@@ -397,11 +450,14 @@

        if data_format == DatasetFormat.TXT.value:
            data = TxtDataParse(data_type=data_type, func=feature_process)
            #print(file)
            data.parse(file, use_raw=use_raw)

        if data_format == DatasetFormat.JSON.value:
            data = JSONDataParse(data_type=data_type, func=feature_process)
            data.parse(file)

        if data_format == DatasetFormat.JSONL.value:
            data = JsonlDataParse(data_type=data_type, func=feature_process)
            data.parse(file)

        return data
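With this dispatch, load_data routes a file literally named metadata.json to JSONMetaDataParse and .jsonl files to JsonlDataParse, both imported from sedna.datasources earlier in this diff. A usage sketch with placeholder paths and data_type values; it assumes the referenced files exist and the sedna package is installed:

from core.testenvmanager.dataset.dataset import Dataset

# A raw JSON Lines file is parsed with JsonlDataParse.
train_data = Dataset.load_data("/home/data/train_data.jsonl", data_type="train")

# A file named metadata.json is handled by JSONMetaDataParse before the
# generic per-format branches run.
test_data = Dataset.load_data("/home/data/metadata.json", data_type="eval")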
Binary file modified docs/guides/images/ianvs_arch.png
@@ -75,13 +75,13 @@ The format of the issue test set is as follows:
"body":"This is not possible right now afaik :/\r\n\r\nMaybe we could have something like this ? wdyt ?\r\n\r\n```python\r\nds = interleave_datasets(\r\n [shuffled_dataset_a, dataset_b],\r\n probabilities=probabilities,\r\n stopping_strategy='all_exhausted',\r\n reshuffle_each_iteration=True,\r\n)",
"answer_1": {
"user_login": "name",
"created_at":"time"
"created_at":"time",
"updated_at": "time",
"body":"This is not possible right now afaik :/\r\n\r\nMaybe we could have something like this ? wdyt ?\r\n\r\n```python\r\nds = interleave_datasets(\r\n [shuffled_dataset_a, dataset_b],\r\n probabilities=probabilities,\r\n stopping_strategy='all_exhausted',\r\n reshuffle_each_iteration=True,\r\n)",
},
"answer_2": {
"user_login": "name",
"created_at":"time"
"created_at":"time",
"updated_at": "time",
"body":"XXXX"
},
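The commas added after the created_at fields keep the example valid JSON. As a minimal sketch of consuming one record in this format, assuming the issue test set is stored with one JSON object per line (JSON Lines), which is an assumption rather than something stated in this diff:

import json

line = '{"body": "question text", "answer_1": {"user_login": "name", "created_at": "time", "updated_at": "time", "body": "answer text"}}'

record = json.loads(line)          # one issue per line
question = record["body"]
first_answer = record["answer_1"]["body"]
print(question, "->", first_answer)  # prints: question text -> answer text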
