diff --git a/charts/kubeai/values-nvidia-k8s-device-plugin.yaml b/charts/kubeai/values-nvidia-k8s-device-plugin.yaml
index 1fe70fd0..2ae5202c 100644
--- a/charts/kubeai/values-nvidia-k8s-device-plugin.yaml
+++ b/charts/kubeai/values-nvidia-k8s-device-plugin.yaml
@@ -11,6 +11,10 @@ resourceProfiles:
     nodeSelector:
       nvidia.com/gpu.family: "hopper"
       nvidia.com/gpu.memory: "81920"
+  nvidia-gpu-gh200:
+    nodeSelector:
+      nvidia.com/gpu.family: "hopper"
+      nvidia.com/gpu.memory: "97871"
   nvidia-gpu-a100-80gb:
     nodeSelector:
       nvidia.com/gpu.family: "ampere"
diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml
index 596a644b..d315db13 100644
--- a/charts/kubeai/values.yaml
+++ b/charts/kubeai/values.yaml
@@ -45,9 +45,9 @@ modelServers:
       google-tpu: "substratusai/vllm:v0.6.4.post1-tpu"
       nvidia-gpu: "vllm/vllm-openai:v0.6.4.post1"
       # TODO (samos123) switch to the official image when it is available.
-      # Note this is simply a clone of drikster80/vllm-gh200-openai:v0.6.3.post1.
       # Source: https://github.com/drikster80/vllm/tree/gh200-docker
-      gh200: "substratusai/vllm-gh200-openai:v0.6.3.post1"
+      # gh200: "drikster80/vllm-gh200-openai:v0.6.4.post1"
+      gh200: "substratusai/vllm-gh200-openai:v0.6.4.post1"
   OLlama:
     images:
       default: "ollama/ollama:latest"
diff --git a/charts/models/values.yaml b/charts/models/values.yaml
index 4c1b18e9..7d7f2714 100644
--- a/charts/models/values.yaml
+++ b/charts/models/values.yaml
@@ -234,6 +234,23 @@ catalog:
       minReplicas: 1
       maxReplicas: 1
     resourceProfile: nvidia-gpu-a100-80gb:8
+  llama-3.3-70b-instruct-bf16-gh200:
+    enabled: false
+    features: [TextGeneration]
+    url: hf://meta-llama/Llama-3.3-70B-Instruct
+    engine: VLLM
+    env:
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:
+      - --max-model-len=32768
+      - --max-num-batched-tokens=32768
+      - --gpu-memory-utilization=0.98
+      - --kv-cache-dtype=fp8
+      - --cpu-offload-gb=60
+      - --enable-prefix-caching
+      - --disable-log-requests
+    resourceProfile: nvidia-gpu-gh200:1
+    targetRequests: 200
   nomic-embed-text-cpu:
     enabled: false
     features: ["TextEmbedding"]
diff --git a/manifests/models/llama-3.3-70b-instruct-bf16-gh200.yaml b/manifests/models/llama-3.3-70b-instruct-bf16-gh200.yaml
new file mode 100644
index 00000000..80a2cbad
--- /dev/null
+++ b/manifests/models/llama-3.3-70b-instruct-bf16-gh200.yaml
@@ -0,0 +1,21 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: llama-3.3-70b-instruct-bf16-gh200
+spec:
+  features: [TextGeneration]
+  url: hf://meta-llama/Llama-3.3-70B-Instruct
+  engine: VLLM
+  args:
+    - --max-model-len=32768
+    - --max-num-batched-tokens=32768
+    - --gpu-memory-utilization=0.98
+    - --kv-cache-dtype=fp8
+    - --cpu-offload-gb=60
+    - --enable-prefix-caching
+    - --disable-log-requests
+  env:
+    VLLM_ATTENTION_BACKEND: FLASHINFER
+  targetRequests: 200
+  resourceProfile: nvidia-gpu-gh200:1
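
Usage sketch (not part of the patch): the new catalog entry ships with enabled: false, so under the standard Helm flow for the models chart it could be switched on with a values override like the one below; the file name values-gh200.yaml is illustrative, not something this change introduces.

# values-gh200.yaml (hypothetical override file name)
catalog:
  llama-3.3-70b-instruct-bf16-gh200:
    enabled: true

Alternatively, since the rendered manifest is checked in above, the Model resource can be created directly with kubectl apply -f manifests/models/llama-3.3-70b-instruct-bf16-gh200.yaml.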