add Impl for llm edge benchmark suite
Signed-off-by: yexiaochuan <[email protected]>
yexiaochuan committed Sep 28, 2024
1 parent 4de73b2 commit 2893646
Showing 10 changed files with 325 additions and 0 deletions.
@@ -0,0 +1,2 @@
Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-Ianvs
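This commit adds the benchmarking configuration, the llama-cpp base model wrapper, and the metric scripts for the suite. Once the model weights are downloaded, the benchmark is expected to be launched through the Ianvs CLI, e.g. `ianvs -f ./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml`; the exact path of the benchmarking config is an assumption inferred from the URLs used in the files below.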

@@ -0,0 +1,29 @@
benchmarkingjob:
  name: "benchmarkingjob"
  workspace: "./workspace"

  testenv: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml"

  test_object:
    type: "algorithms"
    algorithms:
      - name: "llama-cpp"
        url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml"

  rank:
    sort_by:
      - { "latency": "descend" }
      - { "throughput": "ascend" }
      - { "perplexity": "ascend" }

    visualization:
      mode: "selected_only"
      method: "print_table"

    selected_dataitem:
      paradigms: [ "all" ]
      modules: [ "all" ]
      hyperparameters: [ "all" ]
      metrics: [ "latency", "throughput", "perplexity" ]

    save_mode: "selected_and_all"
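The rank.sort_by list orders the result table by the listed metrics, first entry having the highest priority. A minimal sketch of the ordering these directives describe (illustrative only, with made-up numbers; not the Ianvs ranking implementation):

# Illustrative only: apply the sort_by keys to a list of per-config results.
results = [
    {"latency": 120.0, "throughput": 8.3, "perplexity": 11.2},
    {"latency": 95.0, "throughput": 10.1, "perplexity": 10.8},
]
sort_by = [("latency", "descend"), ("throughput", "ascend"), ("perplexity", "ascend")]
# Apply the keys in reverse so the first entry wins ties (list.sort is stable).
for key, order in reversed(sort_by):
    results.sort(key=lambda r: r[key], reverse=(order == "descend"))
print(results)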
@@ -0,0 +1,19 @@
algorithm:
  paradigm_type: "singletasklearning"

  initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf"

  modules:
    - type: "basemodel"
      name: "LlamaCppModel"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py"
      hyperparameters:
        - model_path:
            values:
              - "models/qwen/qwen_1_5_0_5b_q8_0.gguf"
        - quantization_type:
            values:
              - "q8_0"
        - n_ctx:
            values:
              - 2048
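Each entry under hyperparameters contributes one axis of a configuration grid, and every combination is handed to the base model as keyword arguments, which matches the kwargs.get(...) calls in basemodel.py below. A hedged sketch of that expansion, assuming only the values listed above (Ianvs performs the real expansion internally):

# Sketch of how the hyperparameter lists above expand into concrete configs.
from itertools import product

hyperparameters = {
    "model_path": ["models/qwen/qwen_1_5_0_5b_q8_0.gguf"],
    "quantization_type": ["q8_0"],
    "n_ctx": [2048],
}

keys = list(hyperparameters)
for combo in product(*hyperparameters.values()):
    config = dict(zip(keys, combo))
    print(config)  # each config dict becomes the kwargs of LlamaCppModel(**config)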
@@ -0,0 +1,135 @@
from sedna.common.class_factory import ClassFactory, ClassType
from llama_cpp import Llama
from contextlib import redirect_stderr
import os
import psutil
import time


@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
class LlamaCppModel:
    def __init__(self, **kwargs):
        """
        Initialize the llama-cpp model.
        """
        model_path = kwargs.get("model_path")
        if not model_path:
            raise ValueError("Model path is required.")
        quantization_type = kwargs.get("quantization_type", None)
        if quantization_type:
            print(f"Using quantization type: {quantization_type}")
        # Init LLM model
        self.model = Llama(
            model_path=model_path,
            n_ctx=kwargs.get("n_ctx", 512),
            n_gpu_layers=kwargs.get("n_gpu_layers", 0),
            seed=kwargs.get("seed", -1),
            f16_kv=kwargs.get("f16_kv", True),
            logits_all=kwargs.get("logits_all", False),
            vocab_only=kwargs.get("vocab_only", False),
            use_mlock=kwargs.get("use_mlock", False),
            embedding=kwargs.get("embedding", False),
        )

    def predict(self, data=None, input_shape=None, **kwargs):
        # TODO: build the prompt from `data` instead of this fixed example
        prompt = (
            "Q: Name the planets in the solar system? A: "
        )
        process = psutil.Process(os.getpid())
        start_time = time.time()

        import io

        # llama.cpp writes its timing report to stderr; capture it for parsing
        f = io.StringIO()
        with redirect_stderr(f):
            output = self.model(
                prompt=prompt,
                max_tokens=kwargs.get("max_tokens", 32),
                stop=kwargs.get("stop", ["Q:", "\n"]),
                echo=kwargs.get("echo", True),
                temperature=kwargs.get("temperature", 0.8),
                top_p=kwargs.get("top_p", 0.95),
                top_k=kwargs.get("top_k", 40),
                repeat_penalty=kwargs.get("repeat_penalty", 1.1),
            )

        timing_output = f.getvalue()

        end_time = time.time()
        total_time = end_time - start_time  # wall-clock time in seconds
        # parse timing info reported by llama.cpp
        timings = self._parse_timings(timing_output)
        prefill_latency = timings.get('prompt_eval_time', 0.0)  # ms
        generated_text = output['choices'][0]['text']

        mem_usage = process.memory_info().rss  # bytes

        result_with_time_mem = {
            "generated_text": generated_text,
            "total_time": timings.get('total_time', 0.0),  # ms
            "prefill_latency": prefill_latency,  # ms
            "mem_usage": mem_usage  # bytes
        }

        predict_dict = {
            "results": [result_with_time_mem]
        }

        return predict_dict

    def _parse_timings(self, timing_output):
        import re
        timings = {}
        for line in timing_output.split('\n'):
            match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line)
            if match:
                key = match.group(1).strip()
                value = float(match.group(2))

                key = key.lower().replace(' ', '_')
                timings[key] = value
                print(f"Captured timing: {key} = {value}")

        return timings

    def evaluate(self, data, model_path=None, **kwargs):
        """
        Evaluate the model.
        """
        if data is None or data.x is None:
            raise ValueError("Evaluation data is None.")

        if model_path:
            self.load(model_path)

        # do predict
        predict_dict = self.predict(data.x, **kwargs)

        # compute metrics
        metric = kwargs.get("metric")
        if metric is None:
            raise ValueError("No metric provided in kwargs.")

        metric_name, metric_func = metric

        if callable(metric_func):
            # metric functions expect the full predict dict with a "results" key
            metric_value = metric_func(None, predict_dict)
            return {metric_name: metric_value}
        else:
            raise ValueError(f"Metric function {metric_name} is not callable or not provided.")

    def save(self, model_path):
        pass

    def load(self, model_url):
        pass

    def train(self, train_data, valid_data=None, **kwargs):
        # Training is intentionally a no-op for a pre-trained gguf model.
        print("Training is not supported for this model. Skipping training step.")
        return
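A minimal local smoke test for the wrapper above, assuming the quantized gguf file from algorithm.yaml has already been downloaded to models/qwen/; this is not part of the benchmark flow, just a quick sanity check:

# Hypothetical standalone check; the model path mirrors algorithm.yaml and must exist locally.
if __name__ == "__main__":
    model = LlamaCppModel(
        model_path="models/qwen/qwen_1_5_0_5b_q8_0.gguf",
        quantization_type="q8_0",
        n_ctx=2048,
    )
    result = model.predict(max_tokens=32)
    print(result["results"][0]["generated_text"])
    print("total_time (ms):", result["results"][0]["total_time"])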
@@ -0,0 +1,22 @@
import argparse
from modelscope import snapshot_download


def download_model(model_id, revision, local_dir):
    try:
        model_dir = snapshot_download(model_id, revision=revision, cache_dir=local_dir)
        print(f"Model successfully downloaded to: {model_dir}")
        return model_dir
    except Exception as e:
        print(f"Error downloading model: {str(e)}")
        return None


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download a model from ModelScope")
    parser.add_argument("--model_id", type=str, required=True, help="ModelScope model ID")
    parser.add_argument("--revision", type=str, default="master", help="Model revision")
    parser.add_argument("--local_dir", type=str, required=True, help="Local directory to save the model")

    args = parser.parse_args()

    download_model(args.model_id, args.revision, args.local_dir)
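For a quick programmatic use of the helper, the call below sketches the expected arguments; the model id is a placeholder, substitute the actual ModelScope id of the Qwen weights before running:

# Placeholder model id; replace with the real ModelScope id before running.
download_model("<modelscope-model-id>", "master", "./models")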
@@ -0,0 +1,32 @@
# Copyright 2023 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["latency"]


@ClassFactory.register(ClassType.GENERAL, alias="latency")
def latency(y_true, y_pred):
    results_list = y_pred.get('results', [])
    total_time = 0.0
    count = 0
    for result in results_list:
        if isinstance(result, dict) and 'total_time' in result:
            total_time += result['total_time']  # ms
            count += 1
    average_latency = total_time / count if count > 0 else 0.0
    print(f"Average Latency: {average_latency} ms")
    return average_latency
@@ -0,0 +1,18 @@
from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["mem_usage"]


@ClassFactory.register(ClassType.GENERAL, alias="mem_usage")
def mem_usage(y_true, y_pred):
    results_list = y_pred.get('results', [])
    total_mem_usage = 0.0
    count = 0
    for result in results_list:
        if isinstance(result, dict) and 'mem_usage' in result:
            mem_usage_bytes = result['mem_usage']
            mem_usage_mb = mem_usage_bytes / (1024 * 1024)  # bytes -> MB
            total_mem_usage += mem_usage_mb
            count += 1
    average_mem_usage = total_mem_usage / count if count > 0 else 0.0
    print(f"Average Memory Usage: {average_mem_usage} MB")
    return average_mem_usage
@@ -0,0 +1,17 @@
from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["prefill_latency"]


@ClassFactory.register(ClassType.GENERAL, alias="prefill_latency")
def prefill_latency(y_true, y_pred):
    results_list = y_pred.get('results', [])

    total_prefill_time = 0.0
    count = 0
    for result in results_list:
        if isinstance(result, dict) and 'prefill_latency' in result:
            total_prefill_time += result['prefill_latency']
            count += 1
    average_prefill_latency = total_prefill_time / count if count > 0 else 0.0
    print(f"Average Prefill Latency: {average_prefill_latency} ms")
    return average_prefill_latency
@@ -0,0 +1,15 @@
testenv:
  dataset:
    train_url: "dataset/train_data/index.txt"
    test_url: "dataset/test_data/index.txt"

  metrics:
    - name: "latency"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py"
    - name: "throughput"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py"
    - name: "prefill_latency"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py"
    - name: "mem_usage"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py"
@@ -0,0 +1,36 @@
# Copyright 2023 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["throughput"]


@ClassFactory.register(ClassType.GENERAL, alias="throughput")
def throughput(y_true, y_pred):
    results_list = y_pred.get('results', [])

    total_time = 0.0  # ms
    num_requests = 0
    for result in results_list:
        if isinstance(result, dict) and 'total_time' in result:
            total_time += result['total_time']
            num_requests += 1
    if total_time > 0:
        throughput_value = num_requests / (total_time / 1000)
    else:
        throughput_value = 0.0
    print(f"Throughput: {throughput_value} requests/second")
    return throughput_value
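For reference, a hedged example of feeding a metric function the dict produced by LlamaCppModel.predict; the numbers are made up, and in the actual benchmark Ianvs wires the metrics in via testenv.yaml:

# Illustrative only: the metric expects the predict_dict shape returned by LlamaCppModel.predict.
sample_predict_dict = {
    "results": [
        {"generated_text": "...", "total_time": 850.0, "prefill_latency": 120.0,
         "mem_usage": 512 * 1024 * 1024},
    ]
}
print(throughput(None, sample_predict_dict))  # -> requests per second for the sample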
