add llama 3.1 70b fp8 model on 1 x gh200 (#302)
samos123 authored Nov 8, 2024
1 parent 766d1d8 commit 4f2bf76
Showing 4 changed files with 50 additions and 3 deletions.
6 changes: 5 additions & 1 deletion charts/kubeai/values.yaml
@@ -20,7 +20,11 @@ modelServers:
       default: "vllm/vllm-openai:v0.6.3.post1"
       cpu: "substratusai/vllm:v0.6.3.post1-cpu"
       google-tpu: "substratusai/vllm:v0.6.3.post1-tpu"
-      gh200: "drikster80/vllm-gh200-openai:v0.6.3.post1"
+      nvidia-gpu: "vllm/vllm-openai:v0.6.3.post1"
+      # TODO (samos123) switch to the official image when it is available.
+      # Note this is simply a clone of drikster80/vllm-gh200-openai:v0.6.3.post1.
+      # Source: https://github.com/drikster80/vllm/tree/gh200-docker
+      gh200: "substratusai/vllm-gh200-openai:v0.6.3.post1"
   OLlama:
     images:
       default: "ollama/ollama:latest"
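The new gh200 image key takes effect per model through its resource profile rather than globally. A minimal sketch of the matching profile, assuming KubeAI's resourceProfiles select a server image via an imageName field that matches a key under modelServers.VLLM.images (the limits and nodeSelector shown are illustrative, not part of this commit):

resourceProfiles:
  nvidia-gpu-gh200:
    imageName: gh200              # selects the gh200 entry above
    limits:
      nvidia.com/gpu: "1"         # illustrative: one GH200 superchip per replica
    nodeSelector:
      kubernetes.io/arch: arm64   # GH200 pairs Grace (ARM) CPUs with Hopper GPUs,
                                  # which is why a separate arm64 vLLM image is needed
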
19 changes: 19 additions & 0 deletions charts/models/values.yaml
@@ -161,6 +161,25 @@ catalog:
     # You can also use nvidia-gpu-a100-80gb:8
     resourceProfile: nvidia-gpu-h100:8
     targetRequests: 500
+  llama-3.1-70b-instruct-fp8-gh200:
+    enabled: true
+    features: [TextGeneration]
+    url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
+    engine: VLLM
+    env:
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:
+      - --max-model-len=32768
+      - --max-num-batched-tokens=32768
+      - --max-num-seqs=1024
+      - --gpu-memory-utilization=0.9
+      - --enable-prefix-caching
+      - --enable-chunked-prefill=false
+      - --disable-log-requests
+      - --kv-cache-dtype=fp8
+      - --enforce-eager
+    resourceProfile: nvidia-gpu-gh200:1
+    targetRequests: 1024
   llama-3.1-70b-instruct-awq-int4-gh200:
     enabled: false
     features: [TextGeneration]
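Everything under the model key is plain Helm values, so a cluster can enable or tune this entry without editing the chart. A hedged sketch of an override file (file name, release name, and chart path are illustrative):

# models-override.yaml -- hypothetical override for the models chart
catalog:
  llama-3.1-70b-instruct-fp8-gh200:
    enabled: true
    targetRequests: 512   # example of tuning a catalog field per cluster
# applied with something like:
#   helm upgrade --install kubeai-models ./charts/models -f models-override.yaml
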
4 changes: 2 additions & 2 deletions manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml
@@ -1,3 +1,4 @@
+# Source: models/templates/models.yaml
 apiVersion: kubeai.org/v1
 kind: Model
 metadata:
@@ -13,5 +14,4 @@ spec:
     - --enable-prefix-caching
     - --disable-log-requests
   targetRequests: 50
-  minReplicas: 1
-  resourceProfile: nvidia-gpu-gh200:1
+  resourceProfile: nvidia-gpu-gh200:1
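Removing minReplicas: 1 means this (disabled-by-default) AWQ variant no longer pins a GH200 node while idle. A sketch of the replica fields, under the assumption that KubeAI treats an omitted minReplicas as 0 and scales from zero:

spec:
  # minReplicas omitted -- assumed to default to 0, so the server only
  # runs while requests are in flight or queued
  maxReplicas: 1   # hypothetical cap; not set by this commit
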
24 changes: 24 additions & 0 deletions manifests/models/llama-3.1-70b-instruct-fp8-gh200.yaml
@@ -0,0 +1,24 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: llama-3.1-70b-instruct-fp8-gh200
+spec:
+  features: [TextGeneration]
+  owner:
+  url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
+  engine: VLLM
+  args:
+    - --max-model-len=32768
+    - --max-num-batched-tokens=32768
+    - --max-num-seqs=1024
+    - --gpu-memory-utilization=0.9
+    - --enable-prefix-caching
+    - --enable-chunked-prefill=false
+    - --disable-log-requests
+    - --kv-cache-dtype=fp8
+    - --enforce-eager
+  env:
+    VLLM_ATTENTION_BACKEND: FLASHINFER
+  targetRequests: 1024
+  resourceProfile: nvidia-gpu-gh200:1
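The env block rides along with the args into the vLLM container. A rough sketch of the container spec the controller might render for this Model; only the image, env value, and args come from this commit, while the surrounding field layout and the --model mapping are assumptions:

containers:
  - name: server   # hypothetical container name
    image: substratusai/vllm-gh200-openai:v0.6.3.post1
    env:
      - name: VLLM_ATTENTION_BACKEND
        value: FLASHINFER   # attention backend set by this Model's env
    args:
      - --model=neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8   # assumed mapping from the hf:// url
      - --kv-cache-dtype=fp8
      - --enforce-eager
    resources:
      limits:
        nvidia.com/gpu: "1"   # per the nvidia-gpu-gh200:1 profile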
