Llama 3.1 70b on L4 with pipeline parallelism (#307)
samos123 authored Nov 10, 2024
1 parent 4f2bf76 commit c30396a
Showing 3 changed files with 57 additions and 0 deletions.
6 changes: 6 additions & 0 deletions charts/kubeai/values.yaml
@@ -67,6 +67,12 @@ resourceProfiles:
      # Perhaps this is just needed for GKE Autopilot which defaults
      # to 1Gi for CPU-only.
      # ephemeral-storage: "2Gi"
  nvidia-gpu-t4:
    imageName: "nvidia-gpu"
    limits:
      nvidia.com/gpu: "1"
    requests:
      nvidia.com/gpu: "1"
  nvidia-gpu-l4:
    imageName: "nvidia-gpu"
    limits:
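For context, a Model resource selects one of these profiles through its resourceProfile field, suffixed with the number of GPUs to request. A minimal sketch, assuming the Model schema shown in the manifest later in this commit; the model name and Hugging Face URL here are illustrative only, not part of this change:

apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: example-model-t4                  # hypothetical name
spec:
  features: [TextGeneration]
  url: hf://example-org/example-model     # hypothetical model URL
  engine: VLLM
  resourceProfile: nvidia-gpu-t4:1        # one GPU from the new nvidia-gpu-t4 profile
  targetRequests: 100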
25 changes: 25 additions & 0 deletions charts/models/values.yaml
@@ -144,6 +144,31 @@ catalog:
      - --disable-log-requests
    resourceProfile: nvidia-gpu-h100:2
    targetRequests: 500
  llama-3.1-70b-instruct-fp8-l4:
    enabled: false
    features: [TextGeneration]
    url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
    engine: VLLM
    env:
      VLLM_ATTENTION_BACKEND: FLASHINFER
    args:
      - --max-model-len=32768
      - --max-num-batched-tokens=32768
      - --max-num-seqs=512
      - --gpu-memory-utilization=0.9
      # Pipeline parallelism performs better than tensor parallelism over PCIe.
      - --pipeline-parallel-size=4
      # A tensor parallel size of at least 2 was needed to avoid OOM errors.
      # With 8 GPUs, a 4 x 2 (pipeline x tensor) parallelism strategy works well.
      - --tensor-parallel-size=2
      - --enable-prefix-caching
      - --enable-chunked-prefill=false
      - --disable-log-requests
      - --kv-cache-dtype=fp8
      # Non-eager (CUDA graph) mode wasn't supported with FLASHINFER, so eager execution is enforced.
      - --enforce-eager
    resourceProfile: nvidia-gpu-l4:8
    targetRequests: 500
  llama-3.1-405b-instruct-fp8-h100:
    enabled: false
    features: [TextGeneration]
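The catalog entry ships with enabled: false. A hedged sketch of a Helm values override that would turn it on when installing the models chart (the exact install flow is outside this commit):

catalog:
  llama-3.1-70b-instruct-fp8-l4:
    enabled: true

With the entry enabled, the chart renders a Model resource equivalent to the standalone manifest added below.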
26 changes: 26 additions & 0 deletions manifests/models/llama-3.1-70b-instruct-fp8-l4.yaml
@@ -0,0 +1,26 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-70b-instruct-fp8-l4
spec:
  features: [TextGeneration]
  owner:
  url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
  engine: VLLM
  args:
    - --max-model-len=32768
    - --max-num-batched-tokens=32768
    - --max-num-seqs=512
    - --gpu-memory-utilization=0.9
    - --pipeline-parallel-size=4
    - --tensor-parallel-size=2
    - --enable-prefix-caching
    - --enable-chunked-prefill=false
    - --disable-log-requests
    - --kv-cache-dtype=fp8
    - --enforce-eager
  env:
    VLLM_ATTENTION_BACKEND: FLASHINFER
  targetRequests: 500
  resourceProfile: nvidia-gpu-l4:8

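For clarity on the GPU accounting: --pipeline-parallel-size=4 times --tensor-parallel-size=2 gives 4 x 2 = 8 GPU workers, which matches the :8 count suffix in resourceProfile: nvidia-gpu-l4:8, so the per-GPU requests and limits from the nvidia-gpu-l4 profile are presumably scaled by 8 when the serving pod is scheduled.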