From 312ec7bd0347cf18c55aefdfbfaef398c95dbde2 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Fri, 10 May 2024 11:44:38 -0600
Subject: [PATCH 1/3] TEMP: no shared tokenizer from PR-3512

Conflicts with the main line need to be resolved first, so just
don't use the shared tokenizer for now.

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 vllm/entrypoints/grpc/grpc_server.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/grpc/grpc_server.py b/vllm/entrypoints/grpc/grpc_server.py
index f4450e175dc24..9aead23aaee06 100644
--- a/vllm/entrypoints/grpc/grpc_server.py
+++ b/vllm/entrypoints/grpc/grpc_server.py
@@ -118,7 +118,8 @@ def __init__(self, engine: AsyncLLMEngine, args: argparse.Namespace):
 
     async def _post_init(self):
         self.config = await self.engine.get_model_config()
-        self.tokenizer_group = await self.engine.get_tokenizer_group()
+        # self.tokenizer_group = await self.engine.get_tokenizer_group()
+        self.tokenizer_group = self.engine.engine.tokenizer
         self.tokenizer = await self.engine.get_tokenizer()
 
         # Swap in the special TGIS stats logger

From bd23984499fb111a8b2f50317da122c335572a29 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Mon, 29 Apr 2024 15:45:15 -0700
Subject: [PATCH 2/3] [Core] Make Ray an optional "extras" requirement

Still included in built docker images
---
 Dockerfile            |  2 +-
 requirements-cuda.txt |  1 -
 requirements-rocm.txt |  3 +--
 setup.py              | 20 ++++++++++++++++----
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ddca95c0e8786..27dbe6ff88122 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -104,7 +104,7 @@ RUN ldconfig /usr/local/cuda-12.4/compat/
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    pip install "$(echo dist/*.whl)[ray]" --verbose
 #################### vLLM installation IMAGE ####################
 
 
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index acb0164007dba..99891e5b64ab3 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -2,7 +2,6 @@
 -r requirements-common.txt
 
 # Dependencies for NVIDIA GPUs
-ray >= 2.9
 nvidia-ml-py # for pynvml package
 vllm-nccl-cu12>=2.18,<2.19  # for downloading nccl library
 torch == 2.3.0
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index 903845b64d98f..80f4f4431d830 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -1,5 +1,4 @@
 # Common dependencies
 -r requirements-common.txt
 
-# Dependencies for AMD GPUs
-ray == 2.9.3
+# No specific dependencies currently for AMD GPUs
diff --git a/setup.py b/setup.py
index a66af2c5d556f..cae4468cae67d 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 import subprocess
 import sys
 from shutil import which
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import torch
 from packaging.version import Version, parse
@@ -380,6 +380,20 @@ def _read_requirements(filename: str) -> List[str]:
     return requirements
 
 
+def get_extra_requirements() -> Optional[Dict[str, List[str]]]:
+    """Return the optional ("extras") requirements for this platform."""
+    extras = {"tensorizer": ["tensorizer>=2.9.0"]}
+    if _is_cuda():
+        extras["ray"] = ["ray>=2.9"]
+    elif _is_hip():
+        extras["ray"] = ["ray==2.9.3"]
+    # Neuron and CPU builds are supported but get no "ray" extra.
+    elif not (_is_neuron() or _is_cpu()):
+        raise ValueError(
+            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
+    return extras
+
+
 ext_modules = []
 
 if _is_cuda():
@@ -425,9 +439,7 @@ def _read_requirements(filename: str) -> List[str]:
     python_requires=">=3.8",
     install_requires=get_requirements(),
     ext_modules=ext_modules,
-    extras_require={
-        "tensorizer": ["tensorizer>=2.9.0"],
-    },
+    extras_require=get_extra_requirements(),
     cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
     package_data=package_data,
 )

From 3be261cf3e4bfe3eee1650ca6d59e50d52d3d169 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 21 May 2024 12:16:19 +0200
Subject: [PATCH 3/3] Dockerfile.ubi: remove leftover flash-attn references

---
 Dockerfile.ubi | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index d4fbd52d1c8ce..16ad4e6018f39 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -240,10 +240,6 @@ RUN pip install \
     mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/ && \
     chmod 0755 /opt/vllm/lib/libnccl.so.2.18.1
 
-# Install flash attention (from pre-built wheel)
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
-
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install \
         # additional dependencies for the TGIS gRPC server