diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/README.md b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/README.md
new file mode 100644
index 00000000..3a3835c7
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/README.md
@@ -0,0 +1,2 @@
+Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-Ianvs
+
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
new file mode 100644
index 00000000..69c2b72c
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
@@ -0,0 +1,29 @@
+benchmarkingjob:
+  name: "benchmarkingjob"
+  workspace: "./workspace"
+
+  testenv: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml"
+
+  test_object:
+    type: "algorithms"
+    algorithms:
+      - name: "llama-cpp"
+        url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml"
+
+  rank:
+    sort_by:
+      - { "latency": "ascend" }
+      - { "throughput": "descend" }
+      - { "prefill_latency": "ascend" }
+
+  visualization:
+    mode: "selected_only"
+    method: "print_table"
+
+  selected_dataitem:
+    paradigms: [ "all" ]
+    modules: [ "all" ]
+    hyperparameters: [ "all" ]
+    metrics: [ "latency", "throughput", "prefill_latency", "mem_usage" ]
+
+  save_mode: "selected_and_all"
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml
new file mode 100644
index 00000000..9a0f95cf
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml
@@ -0,0 +1,19 @@
+algorithm:
+  paradigm_type: "singletasklearning"
+
+  initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf"
+
+  modules:
+    - type: "basemodel"
+      name: "LlamaCppModel"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py"
+      hyperparameters:
+        - model_path:
+            values:
+              - "models/qwen/qwen_1_5_0_5b_q8_0.gguf"
+        - quantization_type:
+            values:
+              - "q8_0"
+        - n_ctx:
+            values:
+              - 2048
\ No newline at end of file
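The hyperparameters declared in algorithm.yaml (model_path, quantization_type, n_ctx) reach LlamaCppModel below as plain keyword arguments. A minimal standalone smoke-test sketch, not part of the benchmark itself, assuming the quantized GGUF file already exists at the path from the YAML, llama-cpp-python is installed, and basemodel.py is importable from the working directory:

    # Hypothetical standalone check outside ianvs; paths and import layout are assumptions.
    from basemodel import LlamaCppModel  # assumes testalgorithms/ is on PYTHONPATH

    model = LlamaCppModel(
        model_path="models/qwen/qwen_1_5_0_5b_q8_0.gguf",
        quantization_type="q8_0",
        n_ctx=2048,
    )
    out = model.predict()
    print(out["results"][0]["generated_text"])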
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py
new file mode 100644
index 00000000..83fae762
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py
@@ -0,0 +1,128 @@
+from contextlib import redirect_stderr
+import io
+import os
+import time
+
+import psutil
+from llama_cpp import Llama
+from sedna.common.class_factory import ClassFactory, ClassType
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
+class LlamaCppModel:
+    def __init__(self, **kwargs):
+        """Initialize the llama-cpp model."""
+        model_path = kwargs.get("model_path")
+        if not model_path:
+            raise ValueError("Model path is required.")
+        quantization_type = kwargs.get("quantization_type", None)
+        if quantization_type:
+            print(f"Using quantization type: {quantization_type}")
+        # Init LLM model
+        self.model = Llama(
+            model_path=model_path,
+            n_ctx=kwargs.get("n_ctx", 512),
+            n_gpu_layers=kwargs.get("n_gpu_layers", 0),
+            seed=kwargs.get("seed", -1),
+            f16_kv=kwargs.get("f16_kv", True),
+            logits_all=kwargs.get("logits_all", False),
+            vocab_only=kwargs.get("vocab_only", False),
+            use_mlock=kwargs.get("use_mlock", False),
+            embedding=kwargs.get("embedding", False),
+        )
+
+    def predict(self, data=None, input_shape=None, **kwargs):
+        # TODO: take the prompt from data instead of hard-coding it
+        prompt = (
+            "Q: Name the planets in the solar system? A: "
+        )
+        process = psutil.Process(os.getpid())
+        start_time = time.time()
+
+        # llama.cpp prints its timing summary to stderr; capture it for parsing.
+        f = io.StringIO()
+        with redirect_stderr(f):
+            output = self.model(
+                prompt=prompt,
+                max_tokens=kwargs.get("max_tokens", 32),
+                stop=kwargs.get("stop", ["Q:", "\n"]),
+                echo=kwargs.get("echo", True),
+                temperature=kwargs.get("temperature", 0.8),
+                top_p=kwargs.get("top_p", 0.95),
+                top_k=kwargs.get("top_k", 40),
+                repeat_penalty=kwargs.get("repeat_penalty", 1.1),
+            )
+
+        stderr_output = f.getvalue()
+
+        end_time = time.time()
+        wall_clock_ms = (end_time - start_time) * 1000
+
+        # Parse timing info emitted by llama.cpp; fall back to wall-clock time if absent.
+        timings = self._parse_timings(stderr_output)
+        prefill_latency = timings.get('prompt_eval_time', 0.0)  # ms
+        total_time = timings.get('total_time', wall_clock_ms)  # ms
+        generated_text = output['choices'][0]['text']
+
+        mem_usage = process.memory_info().rss  # bytes
+
+        result_with_time_mem = {
+            "generated_text": generated_text,
+            "total_time": total_time,  # ms
+            "prefill_latency": prefill_latency,  # ms
+            "mem_usage": mem_usage  # bytes
+        }
+
+        predict_dict = {
+            "results": [result_with_time_mem]
+        }
+
+        return predict_dict
+
+    def _parse_timings(self, stderr_output):
+        """Parse 'llama_print_timings: <name> = <value> ms' lines into a dict."""
+        import re
+        timings = {}
+        for line in stderr_output.split('\n'):
+            match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line)
+            if match:
+                key = match.group(1).strip().lower().replace(' ', '_')
+                value = float(match.group(2))
+                timings[key] = value
+                print(f"Captured timing: {key} = {value}")
+        return timings
+
+    def evaluate(self, data, model_path=None, **kwargs):
+        """Evaluate the model with the metric passed in through kwargs."""
+        if data is None or data.x is None:
+            raise ValueError("Evaluation data is None.")
+
+        if model_path:
+            self.load(model_path)
+
+        # do predict
+        predict_dict = self.predict(data.x, **kwargs)
+
+        # compute metrics
+        metric = kwargs.get("metric")
+        if metric is None:
+            raise ValueError("No metric provided in kwargs.")
+
+        metric_name, metric_func = metric
+
+        if callable(metric_func):
+            metric_value = metric_func(None, predict_dict)
+            return {metric_name: metric_value}
+        raise ValueError(f"Metric function {metric_name} is not callable or not provided.")
+
+    def save(self, model_path):
+        # The gguf model file is used as-is; nothing to save.
+        pass
+
+    def load(self, model_url):
+        # The model is loaded in __init__; nothing to load here.
+        pass
+
+    def train(self, train_data, valid_data=None, **kwargs):
+        print("Training is not supported for this model. Skipping training step.")
+        return
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/download_model_modelscope.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/download_model_modelscope.py
new file mode 100644
index 00000000..b8b43094
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/download_model_modelscope.py
@@ -0,0 +1,22 @@
+import os
+import argparse
+from modelscope import snapshot_download
+
+def download_model(model_id, revision, local_dir):
+    try:
+        model_dir = snapshot_download(model_id, revision=revision, cache_dir=local_dir)
+        print(f"Model successfully downloaded to: {model_dir}")
+        return model_dir
+    except Exception as e:
+        print(f"Error downloading model: {str(e)}")
+        return None
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download a model from ModelScope")
+    parser.add_argument("--model_id", type=str, required=True, help="ModelScope model ID")
+    parser.add_argument("--revision", type=str, default="master", help="Model revision")
+    parser.add_argument("--local_dir", type=str, required=True, help="Local directory to save the model")
+
+    args = parser.parse_args()
+
+    download_model(args.model_id, args.revision, args.local_dir)
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py
new file mode 100644
index 00000000..b9ff7048
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py
@@ -0,0 +1,31 @@
+# Copyright 2023 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["latency"]
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="latency")
+def latency(y_true, y_pred):
+    results_list = y_pred.get('results', [])
+    total_time = 0.0
+    count = 0
+    for result in results_list:
+        if isinstance(result, dict) and 'total_time' in result:
+            total_time += result['total_time']  # ms
+            count += 1
+    average_latency = total_time / count if count > 0 else 0.0
+    print(f"Average Latency: {average_latency} ms")
+    return average_latency
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py
new file mode 100644
index 00000000..aec119cf
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py
@@ -0,0 +1,18 @@
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["mem_usage"]
+
+@ClassFactory.register(ClassType.GENERAL, alias="mem_usage")
+def mem_usage(y_true, y_pred):
+    results_list = y_pred.get('results', [])
+    total_mem_usage = 0.0
+    count = 0
+    for result in results_list:
+        if isinstance(result, dict) and 'mem_usage' in result:
+            mem_usage_bytes = result['mem_usage']
+            mem_usage_mb = mem_usage_bytes / (1024 * 1024)  # bytes -> MB
+            total_mem_usage += mem_usage_mb
+            count += 1
+    average_mem_usage = total_mem_usage / count if count > 0 else 0.0
+    print(f"Average Memory Usage: {average_mem_usage} MB")
+    return average_mem_usage
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py
new file mode 100644
index 00000000..bc57a365
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py
@@ -0,0 +1,17 @@
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["prefill_latency"]
+
+@ClassFactory.register(ClassType.GENERAL, alias="prefill_latency")
+def prefill_latency(y_true, y_pred):
+    results_list = y_pred.get('results', [])
+
+    total_prefill_time = 0.0
+    count = 0
+    for result in results_list:
+        if isinstance(result, dict) and 'prefill_latency' in result:
+            total_prefill_time += result['prefill_latency']
+            count += 1
+    average_prefill_latency = total_prefill_time / count if count > 0 else 0.0
+    print(f"Average Prefill Latency: {average_prefill_latency} ms")
+    return average_prefill_latency
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml
new file mode 100644
index 00000000..ce49f111
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml
@@ -0,0 +1,14 @@
+testenv:
+  dataset:
+    train_url: "dataset/train_data/index.txt"
+    test_url: "dataset/test_data/index.txt"
+
+  metrics:
+    - name: "latency"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py"
+    - name: "throughput"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py"
+    - name: "prefill_latency"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py"
+    - name: "mem_usage"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py"
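All of the metric modules above, plus throughput.py below, consume the predict_dict structure returned by LlamaCppModel.predict. A small sketch of that contract with made-up numbers, assuming sedna is installed and the metric files are importable from the working directory:

    # Made-up values shaped like one entry of LlamaCppModel.predict()["results"].
    from latency import latency
    from prefill_latency import prefill_latency
    from mem_usage import mem_usage

    y_pred = {
        "results": [{
            "generated_text": "Mercury, Venus, Earth, ...",
            "total_time": 500.0,             # ms
            "prefill_latency": 80.0,         # ms
            "mem_usage": 512 * 1024 * 1024,  # bytes
        }]
    }

    print(latency(None, y_pred))          # -> 500.0 (ms)
    print(prefill_latency(None, y_pred))  # -> 80.0 (ms)
    print(mem_usage(None, y_pred))        # -> 512.0 (MB)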
"mem_usage" + url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py" diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py new file mode 100644 index 00000000..366dfa20 --- /dev/null +++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py @@ -0,0 +1,36 @@ +# Copyright 2023 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +from sedna.common.class_factory import ClassType, ClassFactory + +__all__ = ["throughput"] + +@ClassFactory.register(ClassType.GENERAL, alias="throughput") +def throughput(y_true, y_pred): + results_list = y_pred.get('results', []) + + total_time = 0.0 # /ms + num_requests = 0 + for result in results_list: + if isinstance(result, dict) and 'total_time' in result: + total_time += result['total_time'] + num_requests += 1 + if total_time > 0: + throughput_value = num_requests / (total_time / 1000) + else: + throughput_value = 0.0 + print(f"Throughput: {throughput_value} requests/second") + return throughput_value \ No newline at end of file