Commit bcda5c2

add nvidia-gpu-rtx4070-8gb and qwen2.5 models (#326)

samos123 authored Dec 4, 2024
1 parent 20d35f9 commit bcda5c2

Showing 6 changed files with 81 additions and 2 deletions.
4 changes: 4 additions & 0 deletions charts/kubeai/values-nvidia-k8s-device-plugin.yaml

@@ -19,3 +19,7 @@ resourceProfiles:
     nodeSelector:
       nvidia.com/gpu.family: "ampere"
       nvidia.com/gpu.memory: "40960"
+  nvidia-gpu-rtx4070-8gb:
+    nodeSelector:
+      nvidia.com/gpu.family: "ampere"
+      nvidia.com/gpu.memory: "8188"
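
The nodeSelector keys above are node labels published by NVIDIA GPU Feature Discovery; the profile only schedules model pods onto nodes whose labels match. As a rough sketch, a node eligible for the new profile would carry labels like the following (the node name is hypothetical; the "ampere" and "8188" values are copied verbatim from the diff above):

apiVersion: v1
kind: Node
metadata:
  name: rtx4070-node            # hypothetical node name
  labels:
    # applied automatically by NVIDIA GPU Feature Discovery
    nvidia.com/gpu.family: "ampere"
    nvidia.com/gpu.memory: "8188"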
10 changes: 9 additions & 1 deletion charts/kubeai/values.yaml

@@ -168,7 +168,15 @@ resourceProfiles:
         operator: "Equal"
         value: "present"
         effect: "NoSchedule"
-
+  nvidia-gpu-rtx4070-8gb:
+    imageName: "nvidia-gpu"
+    limits:
+      nvidia.com/gpu: "1"
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Equal"
+        value: "present"
+        effect: "NoSchedule"
 
 cacheProfiles: {}
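
A model references this profile as nvidia-gpu-rtx4070-8gb:1, where the suffix is the GPU count applied to the profile's limits. Combined with the device-plugin overlay above, the scheduling fields stamped onto the model's server pod would look roughly like this (a sketch of the effective pod spec, not exact controller output):

# Approximate pod fields for resourceProfile: nvidia-gpu-rtx4070-8gb:1
nodeSelector:
  nvidia.com/gpu.family: "ampere"
  nvidia.com/gpu.memory: "8188"
tolerations:
  - key: "nvidia.com/gpu"
    operator: "Equal"
    value: "present"
    effect: "NoSchedule"
resources:
  limits:
    nvidia.com/gpu: "1"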
28 changes: 27 additions & 1 deletion charts/models/values.yaml

@@ -187,7 +187,7 @@ catalog:
     resourceProfile: nvidia-gpu-h100:8
     targetRequests: 500
   llama-3.1-70b-instruct-fp8-gh200:
-    enabled: true
+    enabled: false
     features: [TextGeneration]
     url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
     engine: VLLM
@@ -270,6 +270,32 @@ catalog:
     engine: VLLM
     resourceProfile: nvidia-gpu-l4:1
   # Qwen #
+  qwen2.5-coder-1.5b-cpu:
+    enabled: false
+    features: ["TextGeneration"]
+    url: "ollama://qwen2.5-coder:1.5b"
+    engine: OLlama
+    resourceProfile: cpu:1
+  qwen2.5-coder-1.5b-rtx4070-8gb:
+    enabled: false
+    features: ["TextGeneration"]
+    url: "hf://Qwen/Qwen2.5-Coder-1.5B-Instruct"
+    engine: VLLM
+    env:
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:
+      - --max-model-len=2048
+      - --max-num-seqs=16
+      - --quantization=fp8
+      - --kv-cache-dtype=fp8
+    minReplicas: 1
+    resourceProfile: nvidia-gpu-rtx4070-8gb:1
+  qwen2.5-7b-cpu:
+    enabled: false
+    features: ["TextGeneration"]
+    url: "ollama://qwen2.5:7b"
+    engine: OLlama
+    resourceProfile: cpu:2
   qwen2-500m-cpu:
     enabled: false
     features: ["TextGeneration"]
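
All of the new catalog entries land disabled (and the GH200 Llama entry is flipped to disabled as well), so installing the chart does not deploy them by default. A minimal sketch of enabling one through a Helm values override, passed with helm's -f flag (the file name here is arbitrary):

# enable-qwen.yaml -- hypothetical override file for the models chart
catalog:
  qwen2.5-coder-1.5b-rtx4070-8gb:
    enabled: true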
11 changes: 11 additions & 0 deletions manifests/models/qwen2.5-7b-cpu.yaml

@@ -0,0 +1,11 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: qwen2.5-7b-cpu
+spec:
+  features: [TextGeneration]
+  owner:
+  url: ollama://qwen2.5:7b
+  engine: OLlama
+  resourceProfile: cpu:2
11 changes: 11 additions & 0 deletions manifests/models/qwen2.5-coder-1.5b-cpu.yaml

@@ -0,0 +1,11 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: qwen2.5-coder-1.5b-cpu
+spec:
+  features: [TextGeneration]
+  owner:
+  url: ollama://qwen2.5-coder:1.5b
+  engine: OLlama
+  resourceProfile: cpu:1
19 changes: 19 additions & 0 deletions manifests/models/qwen2.5-coder-1.5b-rtx4070-8gb.yaml

@@ -0,0 +1,19 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: qwen2.5-coder-1.5b-rtx4070-8gb
+spec:
+  features: [TextGeneration]
+  owner:
+  url: hf://Qwen/Qwen2.5-Coder-1.5B-Instruct
+  engine: VLLM
+  args:
+    - --max-model-len=2048
+    - --max-num-seqs=16
+    - --quantization=fp8
+    - --kv-cache-dtype=fp8
+  env:
+    VLLM_ATTENTION_BACKEND: FLASHINFER
+  minReplicas: 1
+  resourceProfile: nvidia-gpu-rtx4070-8gb:1
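
The vLLM flags here are what squeeze the model into 8 GB of VRAM: --quantization=fp8 and --kv-cache-dtype=fp8 roughly halve weight and KV-cache memory relative to fp16, while --max-model-len=2048 and --max-num-seqs=16 bound how large the KV cache can grow; VLLM_ATTENTION_BACKEND=FLASHINFER selects the FlashInfer attention backend, which vLLM pairs with fp8 KV caches. Once the Model is applied and enabled, it is served behind KubeAI's OpenAI-compatible endpoint; a throwaway in-cluster smoke test might look like this (a sketch: the pod name is made up, and it assumes the KubeAI service answers at http://kubeai/openai/v1 inside the cluster, the chart's default service):

# Hypothetical smoke-test Pod for the newly deployed model
apiVersion: v1
kind: Pod
metadata:
  name: qwen-smoke-test
spec:
  restartPolicy: Never
  containers:
    - name: curl
      image: curlimages/curl
      args:
        - -s
        - -X
        - POST
        - http://kubeai/openai/v1/completions
        - -H
        - "Content-Type: application/json"
        - -d
        - '{"model": "qwen2.5-coder-1.5b-rtx4070-8gb", "prompt": "def fib(n):", "max_tokens": 64}'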
