Commit bcda5c2

add nvidia-gpu-rtx4070-8gb and qwen2.5 models (#326)

samos123 authored Dec 4, 2024
1 parent 20d35f9 commit bcda5c2

Showing 6 changed files with 81 additions and 2 deletions.
4 changes: 4 additions & 0 deletions charts/kubeai/values-nvidia-k8s-device-plugin.yaml

@@ -19,3 +19,7 @@ resourceProfiles:
     nodeSelector:
       nvidia.com/gpu.family: "ampere"
       nvidia.com/gpu.memory: "40960"
+  nvidia-gpu-rtx4070-8gb:
+    nodeSelector:
+      nvidia.com/gpu.family: "ampere"
+      nvidia.com/gpu.memory: "8188"
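
The nodeSelector keys above are node labels published by NVIDIA GPU Feature Discovery; the profile only schedules model pods onto nodes whose labels match. As a rough sketch, a node eligible for the new profile would carry labels like the following (the node name is hypothetical; the "ampere" and "8188" values are copied verbatim from the diff above):

apiVersion: v1
kind: Node
metadata:
  name: rtx4070-node            # hypothetical node name
  labels:
    # applied automatically by NVIDIA GPU Feature Discovery
    nvidia.com/gpu.family: "ampere"
    nvidia.com/gpu.memory: "8188"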
10 changes: 9 additions & 1 deletion charts/kubeai/values.yaml

@@ -168,7 +168,15 @@ resourceProfiles:
         operator: "Equal"
         value: "present"
         effect: "NoSchedule"
-
+  nvidia-gpu-rtx4070-8gb:
+    imageName: "nvidia-gpu"
+    limits:
+      nvidia.com/gpu: "1"
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Equal"
+        value: "present"
+        effect: "NoSchedule"
 
 cacheProfiles: {}
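
A model references this profile as nvidia-gpu-rtx4070-8gb:1, where the suffix is the GPU count applied to the profile's limits. Combined with the device-plugin overlay above, the scheduling fields stamped onto the model's server pod would look roughly like this (a sketch of the effective pod spec, not exact controller output):

# Approximate pod fields for resourceProfile: nvidia-gpu-rtx4070-8gb:1
nodeSelector:
  nvidia.com/gpu.family: "ampere"
  nvidia.com/gpu.memory: "8188"
tolerations:
  - key: "nvidia.com/gpu"
    operator: "Equal"
    value: "present"
    effect: "NoSchedule"
resources:
  limits:
    nvidia.com/gpu: "1"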
28 changes: 27 additions & 1 deletion charts/models/values.yaml

@@ -187,7 +187,7 @@ catalog:
     resourceProfile: nvidia-gpu-h100:8
     targetRequests: 500
   llama-3.1-70b-instruct-fp8-gh200:
-    enabled: true
+    enabled: false
     features: [TextGeneration]
     url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
     engine: VLLM
@@ -270,6 +270,32 @@ catalog:
     engine: VLLM
     resourceProfile: nvidia-gpu-l4:1
   # Qwen #
+  qwen2.5-coder-1.5b-cpu:
+    enabled: false
+    features: ["TextGeneration"]
+    url: "ollama://qwen2.5-coder:1.5b"
+    engine: OLlama
+    resourceProfile: cpu:1
+  qwen2.5-coder-1.5b-rtx4070-8gb:
+    enabled: false
+    features: ["TextGeneration"]
+    url: "hf://Qwen/Qwen2.5-Coder-1.5B-Instruct"
+    engine: VLLM
+    env:
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:
+      - --max-model-len=2048
+      - --max-num-seqs=16
+      - --quantization=fp8
+      - --kv-cache-dtype=fp8
+    minReplicas: 1
+    resourceProfile: nvidia-gpu-rtx4070-8gb:1
+  qwen2.5-7b-cpu:
+    enabled: false
+    features: ["TextGeneration"]
+    url: "ollama://qwen2.5:7b"
+    engine: OLlama
+    resourceProfile: cpu:2
   qwen2-500m-cpu:
     enabled: false
     features: ["TextGeneration"]
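
All of the new catalog entries land disabled (and the GH200 Llama entry is flipped to disabled as well), so installing the chart does not deploy them by default. A minimal sketch of enabling one through a Helm values override, passed with helm's -f flag (the file name here is arbitrary):

# enable-qwen.yaml -- hypothetical override file for the models chart
catalog:
  qwen2.5-coder-1.5b-rtx4070-8gb:
    enabled: true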
11 changes: 11 additions & 0 deletions manifests/models/qwen2.5-7b-cpu.yaml

@@ -0,0 +1,11 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: qwen2.5-7b-cpu
+spec:
+  features: [TextGeneration]
+  owner:
+  url: ollama://qwen2.5:7b
+  engine: OLlama
+  resourceProfile: cpu:2
11 changes: 11 additions & 0 deletions manifests/models/qwen2.5-coder-1.5b-cpu.yaml

@@ -0,0 +1,11 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: qwen2.5-coder-1.5b-cpu
+spec:
+  features: [TextGeneration]
+  owner:
+  url: ollama://qwen2.5-coder:1.5b
+  engine: OLlama
+  resourceProfile: cpu:1
19 changes: 19 additions & 0 deletions manifests/models/qwen2.5-coder-1.5b-rtx4070-8gb.yaml

@@ -0,0 +1,19 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: qwen2.5-coder-1.5b-rtx4070-8gb
+spec:
+  features: [TextGeneration]
+  owner:
+  url: hf://Qwen/Qwen2.5-Coder-1.5B-Instruct
+  engine: VLLM
+  args:
+    - --max-model-len=2048
+    - --max-num-seqs=16
+    - --quantization=fp8
+    - --kv-cache-dtype=fp8
+  env:
+    VLLM_ATTENTION_BACKEND: FLASHINFER
+  minReplicas: 1
+  resourceProfile: nvidia-gpu-rtx4070-8gb:1
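
The vLLM flags here are what squeeze the model into 8 GB of VRAM: --quantization=fp8 and --kv-cache-dtype=fp8 roughly halve weight and KV-cache memory relative to fp16, while --max-model-len=2048 and --max-num-seqs=16 bound how large the KV cache can grow; VLLM_ATTENTION_BACKEND=FLASHINFER selects the FlashInfer attention backend, which vLLM pairs with fp8 KV caches. Once the Model is applied and enabled, it is served behind KubeAI's OpenAI-compatible endpoint; a throwaway in-cluster smoke test might look like this (a sketch: the pod name is made up, and it assumes the KubeAI service answers at http://kubeai/openai/v1 inside the cluster, the chart's default service):

# Hypothetical smoke-test Pod for the newly deployed model
apiVersion: v1
kind: Pod
metadata:
  name: qwen-smoke-test
spec:
  restartPolicy: Never
  containers:
    - name: curl
      image: curlimages/curl
      args:
        - -s
        - -X
        - POST
        - http://kubeai/openai/v1/completions
        - -H
        - "Content-Type: application/json"
        - -d
        - '{"model": "qwen2.5-coder-1.5b-rtx4070-8gb", "prompt": "def fib(n):", "max_tokens": 64}'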
