# eval_ruler.py (forked from open-compass/opencompass)
# OpenCompass config that evaluates several models on the RULER datasets
# at multiple context lengths.
from opencompass.partitioners import (
NaivePartitioner,
NumWorkerPartitioner,
)
from mmengine.config import read_base
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
# Base configs are pulled in inside ``read_base()``; every name imported here
# is used below as an ordinary module-level variable of this config.
# NOTE: keep the import order -- it determines the order of ``import_datasets``.
with read_base():
    # Models under test (each ``models`` list is aliased to a unique name).
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
        models as qwen2_7b_instruct_model,
    )
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
        models as llama3_8b_instruct_model,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import (
        models as internlm2_5_7b_chat_1m,
    )

    # RULER dataset configs; the ``*_datasets`` suffix is what the collection
    # expression below keys on.
    from opencompass.configs.datasets.ruler.ruler_niah_gen import niah_datasets  # Niah
    from opencompass.configs.datasets.ruler.ruler_vt_gen import vt_datasets  # VT
    from opencompass.configs.datasets.ruler.ruler_fwe_gen import fwe_datasets  # FWE
    from opencompass.configs.datasets.ruler.ruler_cwe_gen import cwe_datasets  # CWE
    from opencompass.configs.datasets.ruler.ruler_qa_gen import qa_datasets  # QA

    # Summary groups; picked up by the ``*_summary_groups`` suffix in the
    # summarizer definition.
    from opencompass.configs.summarizers.groups.ruler import ruler_summary_groups

# Concatenate every list whose variable name ends with ``_datasets`` into a
# single flat list. This runs before ``import_datasets`` itself exists, so the
# result does not include itself.
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# ---------------------------------------------------------------------------
# Evaluation config
# ---------------------------------------------------------------------------
# Value written to the ``num_samples`` field of every dataset copy.
NUM_SAMPLES = 500

# Context lengths to be tested and the label appended to each dataset abbr.
# The two lists are consumed pairwise with ``zip`` -- keep them aligned.
max_seq_lens = [4 * 1024, 8 * 1024, 16 * 1024, 32 * 1024]
abbr_suffixs = ['4k', '8k', '16k', '32k']

work_dir = './outputs/ruler'

# ---------------------------------------------------------------------------
# Model settings
# ---------------------------------------------------------------------------
# 33792 == 33 * 1024, i.e. 1k above the largest context length tested (32k).
# NOTE(review): presumably headroom for the prompt template / generated
# tokens -- confirm before changing ``max_seq_lens``.
# Both models are also switched to 2-way tensor parallelism on 2 GPUs.
qwen2_7b_instruct_model[0]['max_seq_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config'].update(session_len=33792, tp=2)
qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 2

llama3_8b_instruct_model[0]['max_seq_len'] = 33792
llama3_8b_instruct_model[0]['engine_config'].update(session_len=33792, tp=2)
llama3_8b_instruct_model[0]['run_cfg']['num_gpus'] = 2

# The internlm2.5 1M-context model is used with its imported settings as-is.

# Pairs of [model config, tokenizer path]; the path is written to each
# dataset's ``tokenizer_model`` field.
model_settings = [
    [
        qwen2_7b_instruct_model[0],
        'Qwen/Qwen2-7B-Instruct',
    ],
    [
        llama3_8b_instruct_model[0],
        'meta-llama/Meta-Llama-3-8B-Instruct',
    ],
    [
        internlm2_5_7b_chat_1m[0],
        'internlm/internlm2_5-7b-chat-1m',
    ],
]
# Dataset Model Combination
# ``model_dataset_combinations`` pairs each model with dataset copies that use
# that model's own tokenizer; ``models`` / ``datasets`` are the flat lists.
datasets = []
models = []
model_dataset_combinations = []

# Different seq length: one pass per (context length, model) pair.
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for model, model_path in model_settings:
        _tmp_datasets = []
        for dataset in import_datasets:
            # Copy first so the imported base config is never mutated and
            # each (length, model) pair gets independent settings.
            tmp_dataset = dataset.deepcopy()
            tmp_dataset['tokenizer_model'] = model_path
            # e.g. '<abbr>_4k' -- keeps abbrs distinct across context lengths.
            tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
            tmp_dataset['num_samples'] = NUM_SAMPLES
            tmp_dataset['max_seq_length'] = max_seq_len
            _tmp_datasets.append(tmp_dataset)

        model_dataset_combinations.append(
            dict(models=[model], datasets=_tmp_datasets)
        )
        # NOTE(review): this runs once per context length, so every model
        # ends up in ``models`` len(max_seq_lens) times. Kept as-is to
        # preserve existing behaviour -- confirm whether de-duplication is
        # wanted.
        models.append(model)
        datasets.extend(_tmp_datasets)
# Inference stage: NumWorkerPartitioner + LocalRunner, up to 16 workers,
# with ``retry`` set to 5.
infer = {
    'partitioner': {'type': NumWorkerPartitioner},
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'task': {'type': OpenICLInferTask},
        'retry': 5,
    },
}

# Evaluation stage: NaivePartitioner + LocalRunner, up to 32 workers.
# (``eval`` shadows the builtin, but the variable name is part of this
# config's interface and must not change.)
eval = {
    'partitioner': {'type': NaivePartitioner},
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 32,
        'task': {'type': OpenICLEvalTask},
    },
}

# Summary: one row per context-length suffix; the groups are gathered from
# every module-level list whose name ends with ``_summary_groups``.
summarizer = {
    'dataset_abbrs': abbr_suffixs,
    'summary_groups': sum(
        [
            groups
            for name, groups in locals().items()
            if name.endswith('_summary_groups')
        ],
        [],
    ),
}
# Example summary table for this config (presumably from a reference run --
# numbers will vary with environment and model versions):
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset    version    metric         mode      qwen2-7b-instruct-turbomind    llama-3-8b-instruct-turbomind    internlm2_5-7b-chat-1m-turbomind
# ---------  ---------  -------------  ------  -----------------------------  -------------------------------  ----------------------------------
# 4k         -          naive_average  gen                             93.66                            93.48                               91.20
# 8k         -          naive_average  gen                             88.38                            89.95                               89.07
# 16k        -          naive_average  gen                             84.27                             0.14                               87.61
# 32k        -          naive_average  gen                             81.36                             0.00                               84.59
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$