From 289364684cbc9ccc598e4b1eb3f30c4327ef7a15 Mon Sep 17 00:00:00 2001
From: yexiaochuan
Date: Tue, 24 Sep 2024 22:51:12 +0800
Subject: [PATCH] add Impl for llm edge benchmark suite

Signed-off-by: yexiaochuan
---
 .../README.md                                 |   2 +
 .../benchmarkingjob.yaml                      |  29 ++++
 .../testalgorithms/algorithm.yaml             |  19 +++
 .../testalgorithms/basemodel.py               | 135 ++++++++++++++++++
 .../download_model_modelscope.py              |  22 +++
 .../testenv/latency.py                        |  32 +++++
 .../testenv/mem_usage.py                      |  18 +++
 .../testenv/prefill_latency.py                |  17 +++
 .../testenv/testenv.yaml                      |  15 ++
 .../testenv/throughput.py                     |  36 +++++
 10 files changed, 325 insertions(+)
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/README.md
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/download_model_modelscope.py
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml
 create mode 100644 examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py

diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/README.md b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/README.md
new file mode 100644
index 00000000..3a3835c7
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/README.md
@@ -0,0 +1,2 @@
+Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-Ianvs
+
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
new file mode 100644
index 00000000..69c2b72c
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
@@ -0,0 +1,29 @@
+benchmarkingjob:
+  name: "benchmarkingjob"
+  workspace: "./workspace"
+
+  testenv: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml"
+
+  test_object:
+    type: "algorithms"
+    algorithms:
+      - name: "llama-cpp"
+        url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml"
+
+  rank:
+    sort_by:
+      - { "latency": "ascend" }
+      - { "throughput": "descend" }
+      - { "prefill_latency": "ascend" }
+
+    visualization:
+      mode: "selected_only"
+      method: "print_table"
+
+    selected_dataitem:
+      paradigms: [ "all" ]
+      modules: [ "all" ]
+      hyperparameters: [ "all" ]
+      metrics: [ "latency", "throughput", "prefill_latency", "mem_usage" ]
+
+    save_mode: "selected_and_all"
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml
new file mode 100644
index 00000000..9a0f95cf
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml
@@ -0,0 +1,19 @@
+algorithm:
+  paradigm_type: "singletasklearning"
+
+  initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf"
+
+  modules:
+    - type: "basemodel"
+      name: "LlamaCppModel"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py"
+      hyperparameters:
+        - model_path:
+            values:
+              - "models/qwen/qwen_1_5_0_5b_q8_0.gguf"
+        - quantization_type:
+            values:
+              - "q8_0"
+        - n_ctx:
+            values:
+              - 2048
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py
new file mode 100644
index 00000000..83fae762
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py
@@ -0,0 +1,135 @@
+import io
+import os
+import re
+import time
+from contextlib import redirect_stderr
+
+import psutil
+from llama_cpp import Llama
+from sedna.common.class_factory import ClassFactory, ClassType
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
+class LlamaCppModel:
+    def __init__(self, **kwargs):
+        """
+        Initialize the llama-cpp model with the given hyperparameters.
+        """
+        model_path = kwargs.get("model_path")
+        if not model_path:
+            raise ValueError("Model path is required.")
+        quantization_type = kwargs.get("quantization_type", None)
+        if quantization_type:
+            print(f"Using quantization type: {quantization_type}")
+        # Initialize the llama.cpp model
+        self.model = Llama(
+            model_path=model_path,
+            n_ctx=kwargs.get("n_ctx", 512),
+            n_gpu_layers=kwargs.get("n_gpu_layers", 0),
+            seed=kwargs.get("seed", -1),
+            f16_kv=kwargs.get("f16_kv", True),
+            logits_all=kwargs.get("logits_all", False),
+            vocab_only=kwargs.get("vocab_only", False),
+            use_mlock=kwargs.get("use_mlock", False),
+            embedding=kwargs.get("embedding", False),
+        )
+
+    def predict(self, data=None, input_shape=None, **kwargs):
+        # TODO: build the prompt from the input data instead of hard-coding it
+        prompt = (
+            "Q: Name the planets in the solar system? A: "
+        )
+        process = psutil.Process(os.getpid())
+        start_time = time.time()
+
+        # llama.cpp writes its timing summary to stderr; capture it for parsing
+        f = io.StringIO()
+        with redirect_stderr(f):
+            output = self.model(
+                prompt=prompt,
+                max_tokens=kwargs.get("max_tokens", 32),
+                stop=kwargs.get("stop", ["Q:", "\n"]),
+                echo=kwargs.get("echo", True),
+                temperature=kwargs.get("temperature", 0.8),
+                top_p=kwargs.get("top_p", 0.95),
+                top_k=kwargs.get("top_k", 40),
+                repeat_penalty=kwargs.get("repeat_penalty", 1.1),
+            )
+
+        stderr_output = f.getvalue()
+
+        end_time = time.time()
+        total_time = end_time - start_time
+        # parse timing info reported by llama.cpp
+        timings = self._parse_timings(stderr_output)
+        prefill_latency = timings.get('prompt_eval_time', 0.0)  # ms
+        generated_text = output['choices'][0]['text']
+
+        mem_usage = process.memory_info().rss  # bytes (resident set size)
+
+        result_with_time_mem = {
+            "generated_text": generated_text,
+            "total_time": timings.get('total_time', 0.0),  # ms
+            "prefill_latency": prefill_latency,  # ms
+            "mem_usage": mem_usage  # bytes
+        }
+
+        predict_dict = {
+            "results": [result_with_time_mem]
+        }
+
+        return predict_dict
+
+    def _parse_timings(self, stderr_output):
+        timings = {}
+        for line in stderr_output.split('\n'):
+            match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line)
+            if match:
+                key = match.group(1).strip()
+                value = float(match.group(2))
+                key = key.lower().replace(' ', '_')
+                timings[key] = value
+                print(f"Captured timing: {key} = {value}")
+        return timings
+
+    def evaluate(self, data, model_path=None, **kwargs):
+        """
+        Evaluate the model with the metric passed in through kwargs.
+        """
+        if data is None or data.x is None:
+            raise ValueError("Evaluation data is None.")
+
+        if model_path:
+            self.load(model_path)
+
+        # run prediction
+        predict_dict = self.predict(data.x, **kwargs)
+
+        # compute metrics
+        metric = kwargs.get("metric")
+        if metric is None:
+            raise ValueError("No metric provided in kwargs.")
+
+        metric_name, metric_func = metric
+
+        if callable(metric_func):
+            metric_value = metric_func(None, predict_dict["results"])
+            return {metric_name: metric_value}
+        else:
+            raise ValueError(f"Metric function {metric_name} is not callable or not provided.")
+
+    def save(self, model_path):
+        pass
+
+    def load(self, model_url):
+        pass
+
+    def train(self, train_data, valid_data=None, **kwargs):
+        print("Training is not supported for LlamaCppModel; skipping training step.")
+        return
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/download_model_modelscope.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/download_model_modelscope.py
new file mode 100644
index 00000000..b8b43094
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/download_model_modelscope.py
@@ -0,0 +1,22 @@
+import argparse
+from modelscope import snapshot_download
+
+def download_model(model_id, revision, local_dir):
+    try:
+        model_dir = snapshot_download(model_id, revision=revision, cache_dir=local_dir)
+        print(f"Model successfully downloaded to: {model_dir}")
+        return model_dir
+    except Exception as e:
+        print(f"Error downloading model: {str(e)}")
+        return None
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download a model from ModelScope")
+    parser.add_argument("--model_id", type=str, required=True, help="ModelScope model ID")
+    parser.add_argument("--revision", type=str, default="master", help="Model revision")
+    parser.add_argument("--local_dir", type=str, required=True, help="Local directory to save the model")
+
+    args = parser.parse_args()
+
+    download_model(args.model_id, args.revision, args.local_dir)
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py
new file mode 100644
index 00000000..b9ff7048
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py
@@ -0,0 +1,32 @@
+# Copyright 2023 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["latency"]
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="latency")
+def latency(y_true, y_pred):
+    results_list = y_pred.get('results', [])
+    total_time = 0.0
+    count = 0
+    for result in results_list:
+        if isinstance(result, dict) and 'total_time' in result:
+            total_time += result['total_time']  # ms
+            count += 1
+    average_latency = total_time / count if count > 0 else 0.0
+    print(f"Average Latency: {average_latency} ms")
+    return average_latency
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py
new file mode 100644
index 00000000..aec119cf
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py
@@ -0,0 +1,18 @@
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["mem_usage"]
+
+@ClassFactory.register(ClassType.GENERAL, alias="mem_usage")
+def mem_usage(y_true, y_pred):
+    results_list = y_pred.get('results', [])
+    total_mem_usage = 0.0
+    count = 0
+    for result in results_list:
+        if isinstance(result, dict) and 'mem_usage' in result:
+            mem_usage_bytes = result['mem_usage']
+            mem_usage_mb = mem_usage_bytes / (1024 * 1024)  # bytes -> MB
+            total_mem_usage += mem_usage_mb
+            count += 1
+    average_mem_usage = total_mem_usage / count if count > 0 else 0.0
+    print(f"Average Memory Usage: {average_mem_usage} MB")
+    return average_mem_usage
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py
new file mode 100644
index 00000000..bc57a365
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py
@@ -0,0 +1,17 @@
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["prefill_latency"]
+
+@ClassFactory.register(ClassType.GENERAL, alias="prefill_latency")
+def prefill_latency(y_true, y_pred):
+    results_list = y_pred.get('results', [])
+
+    total_prefill_time = 0.0
+    count = 0
+    for result in results_list:
+        if isinstance(result, dict) and 'prefill_latency' in result:
+            total_prefill_time += result['prefill_latency']
+            count += 1
+    average_prefill_latency = total_prefill_time / count if count > 0 else 0.0
+    print(f"Average Prefill Latency: {average_prefill_latency} ms")
+    return average_prefill_latency
\ No newline at end of file
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml
new file mode 100644
index 00000000..ce49f111
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml
@@ -0,0 +1,15 @@
+testenv:
+  dataset:
+    train_url: "dataset/train_data/index.txt"
+    test_url: "dataset/test_data/index.txt"
+
+
+  metrics:
+    - name: "latency"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py"
+    - name: "throughput"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py"
+    - name: "prefill_latency"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py"
+    - name: "mem_usage"
+      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py"
diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py
new file mode 100644
index 00000000..366dfa20
--- /dev/null
+++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py
@@ -0,0 +1,36 @@
+# Copyright 2023 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["throughput"]
+
+@ClassFactory.register(ClassType.GENERAL, alias="throughput")
+def throughput(y_true, y_pred):
+    results_list = y_pred.get('results', [])
+
+    total_time = 0.0  # ms
+    num_requests = 0
+    for result in results_list:
+        if isinstance(result, dict) and 'total_time' in result:
+            total_time += result['total_time']
+            num_requests += 1
+    if total_time > 0:
+        throughput_value = num_requests / (total_time / 1000)
+    else:
+        throughput_value = 0.0
+    print(f"Throughput: {throughput_value} requests/second")
+    return throughput_value
\ No newline at end of file