Commit
add Impl for llm edge benchmark suite
Signed-off-by: yexiaochuan <[email protected]>
yexiaochuan committed on Sep 28, 2024 (commit 2893646, 1 parent: 4de73b2)
Showing 10 changed files with 325 additions and 0 deletions.
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/README.md (2 additions & 0 deletions)
@@ -0,0 +1,2 @@
Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-Ianvs
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml (29 additions & 0 deletions)
@@ -0,0 +1,29 @@
benchmarkingjob:
  name: "benchmarkingjob"
  workspace: "./workspace"

  testenv: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml"

  test_object:
    type: "algorithms"
    algorithms:
      - name: "llama-cpp"
        url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml"

  rank:
    sort_by:
      - { "latency": "descend" }
      - { "throughput": "ascend" }
      - { "perplexity": "ascend" }

    visualization:
      mode: "selected_only"
      method: "print_table"

    selected_dataitem:
      paradigms: [ "all" ]
      modules: [ "all" ]
      hyperparameters: [ "all" ]
      metrics: [ "latency", "throughput", "perplexity" ]

    save_mode: "selected_and_all"
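For context on how this configuration is consumed: assuming a standard Ianvs installation, a benchmarking job like this is typically launched from the repository root with the Ianvs CLI, e.g. ianvs -f ./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml.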
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml (19 additions & 0 deletions)
@@ -0,0 +1,19 @@
algorithm:
  paradigm_type: "singletasklearning"

  initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf"

  modules:
    - type: "basemodel"
      name: "LlamaCppModel"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py"
      hyperparameters:
        - model_path:
            values:
              - "models/qwen/qwen_1_5_0_5b_q8_0.gguf"
        - quantization_type:
            values:
              - "q8_0"
        - n_ctx:
            values:
              - 2048
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py (135 additions & 0 deletions)
@@ -0,0 +1,135 @@
from sedna.common.class_factory import ClassFactory, ClassType
from llama_cpp import Llama
from contextlib import redirect_stderr
import io
import os
import psutil
import time


@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
class LlamaCppModel:
    def __init__(self, **kwargs):
        """
        Initialize the llama-cpp model.
        """
        model_path = kwargs.get("model_path")
        if not model_path:
            raise ValueError("Model path is required.")
        quantization_type = kwargs.get("quantization_type", None)
        if quantization_type:
            print(f"Using quantization type: {quantization_type}")
        # Init LLM model
        self.model = Llama(
            model_path=model_path,
            n_ctx=kwargs.get("n_ctx", 512),
            n_gpu_layers=kwargs.get("n_gpu_layers", 0),
            seed=kwargs.get("seed", -1),
            f16_kv=kwargs.get("f16_kv", True),
            logits_all=kwargs.get("logits_all", False),
            vocab_only=kwargs.get("vocab_only", False),
            use_mlock=kwargs.get("use_mlock", False),
            embedding=kwargs.get("embedding", False),
        )

    def predict(self, data=None, input_shape=None, **kwargs):
        # TODO: take the prompt from the input data
        prompt = (
            "Q: Name the planets in the solar system? A: "
        )
        process = psutil.Process(os.getpid())
        start_time = time.time()

        # llama.cpp writes its timing report to stderr; capture it for parsing
        f = io.StringIO()
        with redirect_stderr(f):
            output = self.model(
                prompt=prompt,
                max_tokens=kwargs.get("max_tokens", 32),
                stop=kwargs.get("stop", ["Q:", "\n"]),
                echo=kwargs.get("echo", True),
                temperature=kwargs.get("temperature", 0.8),
                top_p=kwargs.get("top_p", 0.95),
                top_k=kwargs.get("top_k", 40),
                repeat_penalty=kwargs.get("repeat_penalty", 1.1),
            )

        stderr_output = f.getvalue()

        end_time = time.time()
        total_time = end_time - start_time  # wall-clock seconds; the reported timings below come from llama.cpp

        # parse timing info
        timings = self._parse_timings(stderr_output)
        prefill_latency = timings.get('prompt_eval_time', 0.0)  # ms
        generated_text = output['choices'][0]['text']

        mem_usage = process.memory_info().rss  # bytes

        result_with_time_mem = {
            "generated_text": generated_text,
            "total_time": timings.get('total_time', 0.0),  # ms
            "prefill_latency": prefill_latency,  # ms
            "mem_usage": mem_usage  # bytes
        }

        predict_dict = {
            "results": [result_with_time_mem]
        }

        return predict_dict

    def _parse_timings(self, stderr_output):
        import re
        timings = {}
        for line in stderr_output.split('\n'):
            match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line)
            if match:
                # normalize keys, e.g. "prompt eval time" -> "prompt_eval_time"
                key = match.group(1).strip().lower().replace(' ', '_')
                timings[key] = float(match.group(2))
        return timings

    def evaluate(self, data, model_path=None, **kwargs):
        """
        Evaluate the model.
        """
        if data is None or data.x is None:
            raise ValueError("Evaluation data is None.")

        if model_path:
            self.load(model_path)

        # do predict
        predict_dict = self.predict(data.x, **kwargs)

        # compute metrics
        metric = kwargs.get("metric")
        if metric is None:
            raise ValueError("No metric provided in kwargs.")

        metric_name, metric_func = metric

        if callable(metric_func):
            metric_value = metric_func(None, predict_dict["results"])
            return {metric_name: metric_value}
        else:
            raise ValueError(f"Metric function {metric_name} is not callable or not provided.")

    def save(self, model_path):
        pass

    def load(self, model_url):
        pass

    def train(self, train_data, valid_data=None, **kwargs):
        # Training is not applicable to this inference-only benchmark
        print("Training is not supported for this model. Skipping training step.")
        return
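For reference, _parse_timings expects the timing report that llama.cpp writes to stderr. Below is a minimal, self-contained sketch of the same parsing run on illustrative sample lines; the exact report format can vary across llama.cpp versions.

import re

# Illustrative stderr lines in the llama_print_timings format parsed above
sample_stderr = (
    "llama_print_timings: prompt eval time =    67.89 ms\n"
    "llama_print_timings:       total time =   234.56 ms\n"
)

timings = {}
for line in sample_stderr.splitlines():
    match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line)
    if match:
        # normalize keys: "prompt eval time" -> "prompt_eval_time"
        timings[match.group(1).strip().lower().replace(' ', '_')] = float(match.group(2))

print(timings)  # {'prompt_eval_time': 67.89, 'total_time': 234.56}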
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/download_model_modelscope.py (22 additions & 0 deletions)
@@ -0,0 +1,22 @@
import argparse
from modelscope import snapshot_download


def download_model(model_id, revision, local_dir):
    try:
        model_dir = snapshot_download(model_id, revision=revision, cache_dir=local_dir)
        print(f"Model successfully downloaded to: {model_dir}")
        return model_dir
    except Exception as e:
        print(f"Error downloading model: {str(e)}")
        return None


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download a model from ModelScope")
    parser.add_argument("--model_id", type=str, required=True, help="ModelScope model ID")
    parser.add_argument("--revision", type=str, default="master", help="Model revision")
    parser.add_argument("--local_dir", type=str, required=True, help="Local directory to save the model")

    args = parser.parse_args()

    download_model(args.model_id, args.revision, args.local_dir)
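The script is argparse-driven, so it is normally invoked from the command line, e.g. python download_model_modelscope.py --model_id <id> --revision master --local_dir ./models. The function can also be called directly; the model ID below is a hypothetical placeholder, not one pinned by this commit.

# Hypothetical ModelScope model ID, for illustration only
model_dir = download_model("qwen/Qwen1.5-0.5B", "master", "./models")
if model_dir:
    print(f"Model files are available under {model_dir}")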
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py (32 additions & 0 deletions)
@@ -0,0 +1,32 @@
# Copyright 2023 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["latency"]


@ClassFactory.register(ClassType.GENERAL, alias="latency")
def latency(y_true, y_pred):
    results_list = y_pred.get('results', [])
    total_time = 0.0
    count = 0
    for result in results_list:
        if isinstance(result, dict) and 'total_time' in result:
            total_time += result['total_time']  # ms
            count += 1
    average_latency = total_time / count if count > 0 else 0.0
    print(f"Average Latency: {average_latency} ms")
    return average_latency
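As a quick sanity check, the metric can be exercised on a hand-made result dict (the values are illustrative):

sample_pred = {"results": [{"total_time": 120.0}, {"total_time": 80.0}]}
assert latency(None, sample_pred) == 100.0  # (120 + 80) / 2, in ms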
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py (18 additions & 0 deletions)
@@ -0,0 +1,18 @@
from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["mem_usage"]


@ClassFactory.register(ClassType.GENERAL, alias="mem_usage")
def mem_usage(y_true, y_pred):
    results_list = y_pred.get('results', [])
    total_mem_usage = 0.0
    count = 0
    for result in results_list:
        if isinstance(result, dict) and 'mem_usage' in result:
            mem_usage_bytes = result['mem_usage']
            mem_usage_mb = mem_usage_bytes / (1024 * 1024)  # bytes -> MB
            total_mem_usage += mem_usage_mb
            count += 1
    average_mem_usage = total_mem_usage / count if count > 0 else 0.0
    print(f"Average Memory Usage: {average_mem_usage} MB")
    return average_mem_usage
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py (17 additions & 0 deletions)
@@ -0,0 +1,17 @@
from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["prefill_latency"]


@ClassFactory.register(ClassType.GENERAL, alias="prefill_latency")
def prefill_latency(y_true, y_pred):
    results_list = y_pred.get('results', [])

    total_prefill_time = 0.0
    count = 0
    for result in results_list:
        if isinstance(result, dict) and 'prefill_latency' in result:
            total_prefill_time += result['prefill_latency']
            count += 1
    average_prefill_latency = total_prefill_time / count if count > 0 else 0.0
    print(f"Average Prefill Latency: {average_prefill_latency} ms")
    return average_prefill_latency
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml (15 additions & 0 deletions)
@@ -0,0 +1,15 @@
testenv:
  dataset:
    train_url: "dataset/train_data/index.txt"
    test_url: "dataset/test_data/index.txt"

  metrics:
    - name: "latency"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py"
    - name: "throughput"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py"
    - name: "prefill_latency"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py"
    - name: "mem_usage"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py"
examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py (36 additions & 0 deletions)
@@ -0,0 +1,36 @@
# Copyright 2023 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["throughput"]


@ClassFactory.register(ClassType.GENERAL, alias="throughput")
def throughput(y_true, y_pred):
    results_list = y_pred.get('results', [])

    total_time = 0.0  # ms
    num_requests = 0
    for result in results_list:
        if isinstance(result, dict) and 'total_time' in result:
            total_time += result['total_time']
            num_requests += 1
    if total_time > 0:
        throughput_value = num_requests / (total_time / 1000)  # ms -> s
    else:
        throughput_value = 0.0
    print(f"Throughput: {throughput_value} requests/second")
    return throughput_value
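To make the ms-to-seconds conversion concrete, a hand-made example with illustrative values: two requests totalling 500 ms give 2 / 0.5 s = 4 requests/second.

sample_pred = {"results": [{"total_time": 250.0}, {"total_time": 250.0}]}
assert throughput(None, sample_pred) == 4.0  # requests per second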