add llama 3.1 70b fp8 model on 1 x gh200 (#302)
samos123 authored Nov 8, 2024
1 parent 766d1d8 commit 4f2bf76
Showing 4 changed files with 50 additions and 3 deletions.
6 changes: 5 additions & 1 deletion charts/kubeai/values.yaml
@@ -20,7 +20,11 @@ modelServers:
       default: "vllm/vllm-openai:v0.6.3.post1"
       cpu: "substratusai/vllm:v0.6.3.post1-cpu"
       google-tpu: "substratusai/vllm:v0.6.3.post1-tpu"
-      gh200: "drikster80/vllm-gh200-openai:v0.6.3.post1"
+      nvidia-gpu: "vllm/vllm-openai:v0.6.3.post1"
+      # TODO (samos123) switch to the official image when it is available.
+      # Note this is simply a clone of drikster80/vllm-gh200-openai:v0.6.3.post1.
+      # Source: https://github.com/drikster80/vllm/tree/gh200-docker
+      gh200: "substratusai/vllm-gh200-openai:v0.6.3.post1"
   OLlama:
     images:
       default: "ollama/ollama:latest"
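The new gh200 image key takes effect per model through its resource profile rather than globally. A minimal sketch of the matching profile, assuming KubeAI's resourceProfiles select a server image via an imageName field that matches a key under modelServers.VLLM.images (the limits and nodeSelector shown are illustrative, not part of this commit):

resourceProfiles:
  nvidia-gpu-gh200:
    imageName: gh200              # selects the gh200 entry above
    limits:
      nvidia.com/gpu: "1"         # illustrative: one GH200 superchip per replica
    nodeSelector:
      kubernetes.io/arch: arm64   # GH200 pairs Grace (ARM) CPUs with Hopper GPUs,
                                  # which is why a separate arm64 vLLM image is needed
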
19 changes: 19 additions & 0 deletions charts/models/values.yaml
@@ -161,6 +161,25 @@ catalog:
     # You can also use nvidia-gpu-a100-80gb:8
     resourceProfile: nvidia-gpu-h100:8
     targetRequests: 500
+  llama-3.1-70b-instruct-fp8-gh200:
+    enabled: true
+    features: [TextGeneration]
+    url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
+    engine: VLLM
+    env:
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:
+      - --max-model-len=32768
+      - --max-num-batched-tokens=32768
+      - --max-num-seqs=1024
+      - --gpu-memory-utilization=0.9
+      - --enable-prefix-caching
+      - --enable-chunked-prefill=false
+      - --disable-log-requests
+      - --kv-cache-dtype=fp8
+      - --enforce-eager
+    resourceProfile: nvidia-gpu-gh200:1
+    targetRequests: 1024
   llama-3.1-70b-instruct-awq-int4-gh200:
     enabled: false
     features: [TextGeneration]
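Everything under the model key is plain Helm values, so a cluster can enable or tune this entry without editing the chart. A hedged sketch of an override file (file name, release name, and chart path are illustrative):

# models-override.yaml -- hypothetical override for the models chart
catalog:
  llama-3.1-70b-instruct-fp8-gh200:
    enabled: true
    targetRequests: 512   # example of tuning a catalog field per cluster
# applied with something like:
#   helm upgrade --install kubeai-models ./charts/models -f models-override.yaml
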
4 changes: 2 additions & 2 deletions manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml
@@ -1,3 +1,4 @@
+# Source: models/templates/models.yaml
 apiVersion: kubeai.org/v1
 kind: Model
 metadata:
@@ -13,5 +14,4 @@ spec:
     - --enable-prefix-caching
     - --disable-log-requests
   targetRequests: 50
-  minReplicas: 1
-  resourceProfile: nvidia-gpu-gh200:1
+  resourceProfile: nvidia-gpu-gh200:1
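Removing minReplicas: 1 means this (disabled-by-default) AWQ variant no longer pins a GH200 node while idle. A sketch of the replica fields, under the assumption that KubeAI treats an omitted minReplicas as 0 and scales from zero:

spec:
  # minReplicas omitted -- assumed to default to 0, so the server only
  # runs while requests are in flight or queued
  maxReplicas: 1   # hypothetical cap; not set by this commit
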
24 changes: 24 additions & 0 deletions manifests/models/llama-3.1-70b-instruct-fp8-gh200.yaml
@@ -0,0 +1,24 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: llama-3.1-70b-instruct-fp8-gh200
+spec:
+  features: [TextGeneration]
+  owner:
+  url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
+  engine: VLLM
+  args:
+    - --max-model-len=32768
+    - --max-num-batched-tokens=32768
+    - --max-num-seqs=1024
+    - --gpu-memory-utilization=0.9
+    - --enable-prefix-caching
+    - --enable-chunked-prefill=false
+    - --disable-log-requests
+    - --kv-cache-dtype=fp8
+    - --enforce-eager
+  env:
+    VLLM_ATTENTION_BACKEND: FLASHINFER
+  targetRequests: 1024
+  resourceProfile: nvidia-gpu-gh200:1
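The env block rides along with the args into the vLLM container. A rough sketch of the container spec the controller might render for this Model; only the image, env value, and args come from this commit, while the surrounding field layout and the --model mapping are assumptions:

containers:
  - name: server   # hypothetical container name
    image: substratusai/vllm-gh200-openai:v0.6.3.post1
    env:
      - name: VLLM_ATTENTION_BACKEND
        value: FLASHINFER   # attention backend set by this Model's env
    args:
      - --model=neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8   # assumed mapping from the hf:// url
      - --kv-cache-dtype=fp8
      - --enforce-eager
    resources:
      limits:
        nvidia.com/gpu: "1"   # per the nvidia-gpu-gh200:1 profile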
