Llama 3.1 70b on L4 with pipeline parallelism (#307)
samos123 authored Nov 10, 2024
1 parent 4f2bf76 commit c30396a
Showing 3 changed files with 57 additions and 0 deletions.
6 changes: 6 additions & 0 deletions charts/kubeai/values.yaml
@@ -67,6 +67,12 @@ resourceProfiles:
      # Perhaps this is just needed for GKE Autopilot which defaults
      # to 1Gi for CPU-only.
      # ephemeral-storage: "2Gi"
  nvidia-gpu-t4:
    imageName: "nvidia-gpu"
    limits:
      nvidia.com/gpu: "1"
    requests:
      nvidia.com/gpu: "1"
  nvidia-gpu-l4:
    imageName: "nvidia-gpu"
    limits:
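For context, a Model resource selects one of these profiles through its resourceProfile field, suffixed with the number of GPUs to request. A minimal sketch, assuming the Model schema shown in the manifest later in this commit; the model name and Hugging Face URL here are illustrative only, not part of this change:

apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: example-model-t4                  # hypothetical name
spec:
  features: [TextGeneration]
  url: hf://example-org/example-model     # hypothetical model URL
  engine: VLLM
  resourceProfile: nvidia-gpu-t4:1        # one GPU from the new nvidia-gpu-t4 profile
  targetRequests: 100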
25 changes: 25 additions & 0 deletions charts/models/values.yaml
@@ -144,6 +144,31 @@ catalog:
      - --disable-log-requests
    resourceProfile: nvidia-gpu-h100:2
    targetRequests: 500
  llama-3.1-70b-instruct-fp8-l4:
    enabled: false
    features: [TextGeneration]
    url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
    engine: VLLM
    env:
      VLLM_ATTENTION_BACKEND: FLASHINFER
    args:
      - --max-model-len=32768
      - --max-num-batched-tokens=32768
      - --max-num-seqs=512
      - --gpu-memory-utilization=0.9
      # Pipeline parallelism performs better than tensor parallelism over PCIe.
      - --pipeline-parallel-size=4
      # A tensor parallel size of at least 2 was needed to avoid OOM errors.
      # With 8 GPUs, a 4 x 2 (pipeline x tensor) parallelism strategy works well.
      - --tensor-parallel-size=2
      - --enable-prefix-caching
      - --enable-chunked-prefill=false
      - --disable-log-requests
      - --kv-cache-dtype=fp8
      # Non-eager (CUDA graph) mode wasn't supported with FLASHINFER, so eager execution is enforced.
      - --enforce-eager
    resourceProfile: nvidia-gpu-l4:8
    targetRequests: 500
  llama-3.1-405b-instruct-fp8-h100:
    enabled: false
    features: [TextGeneration]
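The catalog entry ships with enabled: false. A hedged sketch of a Helm values override that would turn it on when installing the models chart (the exact install flow is outside this commit):

catalog:
  llama-3.1-70b-instruct-fp8-l4:
    enabled: true

With the entry enabled, the chart renders a Model resource equivalent to the standalone manifest added below.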
26 changes: 26 additions & 0 deletions manifests/models/llama-3.1-70b-instruct-fp8-l4.yaml
@@ -0,0 +1,26 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-70b-instruct-fp8-l4
spec:
  features: [TextGeneration]
  owner:
  url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
  engine: VLLM
  args:
    - --max-model-len=32768
    - --max-num-batched-tokens=32768
    - --max-num-seqs=512
    - --gpu-memory-utilization=0.9
    - --pipeline-parallel-size=4
    - --tensor-parallel-size=2
    - --enable-prefix-caching
    - --enable-chunked-prefill=false
    - --disable-log-requests
    - --kv-cache-dtype=fp8
    - --enforce-eager
  env:
    VLLM_ATTENTION_BACKEND: FLASHINFER
  targetRequests: 500
  resourceProfile: nvidia-gpu-l4:8

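For clarity on the GPU accounting: --pipeline-parallel-size=4 times --tensor-parallel-size=2 gives 4 x 2 = 8 GPU workers, which matches the :8 count suffix in resourceProfile: nvidia-gpu-l4:8, so the per-GPU requests and limits from the nvidia-gpu-l4 profile are presumably scaled by 8 when the serving pod is scheduled.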