Add llama 3.3 70b GH200 model and GH200 profile (#341)
samos123 authored Dec 7, 2024
1 parent 821f9e3 commit 0c8094c
Showing 4 changed files with 44 additions and 2 deletions.
4 changes: 4 additions & 0 deletions charts/kubeai/values-nvidia-k8s-device-plugin.yaml
```diff
@@ -11,6 +11,10 @@ resourceProfiles:
     nodeSelector:
       nvidia.com/gpu.family: "hopper"
       nvidia.com/gpu.memory: "81920"
+  nvidia-gpu-gh200:
+    nodeSelector:
+      nvidia.com/gpu.family: "hopper"
+      nvidia.com/gpu.memory: "97871"
   nvidia-gpu-a100-80gb:
     nodeSelector:
       nvidia.com/gpu.family: "ampere"
```
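The new `nvidia-gpu-gh200` profile is selected purely by node labels. As an illustrative sketch (not KubeAI source code), a profile's `nodeSelector` matches a node exactly when every selector key/value pair is present on the node — the same subset test the Kubernetes scheduler applies; the node label sets below are hypothetical examples of what NVIDIA GPU feature discovery might publish:

```python
# Illustrative sketch (not KubeAI source): a resource profile's nodeSelector
# matches a node when every selector label is present on the node with the
# same value -- the subset test the Kubernetes scheduler applies.

def selector_matches(node_labels: dict, node_selector: dict) -> bool:
    """Return True if every nodeSelector entry appears in node_labels."""
    return all(node_labels.get(k) == v for k, v in node_selector.items())

# nodeSelector from the new nvidia-gpu-gh200 profile above.
gh200_selector = {
    "nvidia.com/gpu.family": "hopper",
    "nvidia.com/gpu.memory": "97871",
}

# Hypothetical labels as NVIDIA GPU feature discovery might publish them.
gh200_node = {
    "nvidia.com/gpu.family": "hopper",
    "nvidia.com/gpu.memory": "97871",
    "kubernetes.io/arch": "arm64",
}
h100_node = {
    "nvidia.com/gpu.family": "hopper",
    "nvidia.com/gpu.memory": "81920",
}

print(selector_matches(gh200_node, gh200_selector))  # True
print(selector_matches(h100_node, gh200_selector))   # False: memory differs
```

Note that both GH200 and H100 report `gpu.family: "hopper"`, so the memory label (`97871` MiB vs `81920` MiB) is what disambiguates the two profiles.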
4 changes: 2 additions & 2 deletions charts/kubeai/values.yaml
```diff
@@ -45,9 +45,9 @@ modelServers:
       google-tpu: "substratusai/vllm:v0.6.4.post1-tpu"
       nvidia-gpu: "vllm/vllm-openai:v0.6.4.post1"
       # TODO (samos123) switch to the official image when it is available.
-      # Note this is simply a clone of drikster80/vllm-gh200-openai:v0.6.3.post1.
       # Source: https://github.com/drikster80/vllm/tree/gh200-docker
-      gh200: "substratusai/vllm-gh200-openai:v0.6.3.post1"
+      # gh200: "drikster80/vllm-gh200-openai:v0.6.4.post1"
+      gh200: "substratusai/vllm-gh200-openai:v0.6.4.post1"
   OLlama:
     images:
       default: "ollama/ollama:latest"
```
17 changes: 17 additions & 0 deletions charts/models/values.yaml
```diff
@@ -234,6 +234,23 @@ catalog:
       minReplicas: 1
       maxReplicas: 1
     resourceProfile: nvidia-gpu-a100-80gb:8
+  llama-3.3-70b-instruct-bf16-gh200:
+    enabled: false
+    features: [TextGeneration]
+    url: hf://meta-llama/Llama-3.3-70B-Instruct
+    engine: VLLM
+    env:
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:
+      - --max-model-len=32768
+      - --max-num-batched-tokens=32768
+      - --gpu-memory-utilization=0.98
+      - --kv-cache-dtype=fp8
+      - --cpu-offload-gb=60
+      - --enable-prefix-caching
+      - --disable-log-requests
+    resourceProfile: nvidia-gpu-gh200:1
+    targetRequests: 200
   nomic-embed-text-cpu:
     enabled: false
     features: ["TextEmbedding"]
```
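The `resourceProfile` values in the catalog follow a `<profile-name>:<count>` convention — `nvidia-gpu-gh200:1` requests one GH200, while the 405B entry above it requests `nvidia-gpu-a100-80gb:8`, eight A100s. A minimal, hypothetical parser (not KubeAI's actual code) for this notation:

```python
# Illustrative only: KubeAI resourceProfile values use a "<name>:<count>"
# convention, e.g. "nvidia-gpu-gh200:1" = one GH200 GPU. This hypothetical
# helper is not KubeAI source code.

def parse_resource_profile(value: str) -> tuple[str, int]:
    """Split 'name:count'; rpartition tolerates ':' inside the name."""
    name, _, count = value.rpartition(":")
    return name, int(count)

print(parse_resource_profile("nvidia-gpu-gh200:1"))      # ('nvidia-gpu-gh200', 1)
print(parse_resource_profile("nvidia-gpu-a100-80gb:8"))  # ('nvidia-gpu-a100-80gb', 8)
```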
21 changes: 21 additions & 0 deletions manifests/models/llama-3.3-70b-instruct-bf16-gh200.yaml
```diff
@@ -0,0 +1,21 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: llama-3.3-70b-instruct-bf16-gh200
+spec:
+  features: [TextGeneration]
+  url: hf://meta-llama/Llama-3.3-70B-Instruct
+  engine: VLLM
+  args:
+    - --max-model-len=32768
+    - --max-num-batched-tokens=32768
+    - --gpu-memory-utilization=0.98
+    - --kv-cache-dtype=fp8
+    - --cpu-offload-gb=60
+    - --enable-prefix-caching
+    - --disable-log-requests
+  env:
+    VLLM_ATTENTION_BACKEND: FLASHINFER
+  targetRequests: 200
+  resourceProfile: nvidia-gpu-gh200:1
```
