From 7f8d612d24c66e9b5f8c0aa6cb562e853e9523a0 Mon Sep 17 00:00:00 2001 From: Earthwalker <48991073+etwk@users.noreply.github.com> Date: Tue, 30 Jul 2024 03:42:21 +0800 Subject: [PATCH] [TPU] Support tensor parallelism in async llm engine (#6891) --- Dockerfile.tpu | 3 +++ vllm/engine/async_llm_engine.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 4fc14d6bd186c..adebb8ab5adca 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -12,6 +12,9 @@ RUN pip install "numpy<2" RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html +# Fix FastAPI dependency +RUN pip install "starlette<0.38.0" + # Build vLLM. COPY . /workspace/vllm ENV VLLM_TARGET_DEVICE="tpu" diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 93cc319f11c42..d3f9a0ab00f10 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -407,8 +407,14 @@ def _get_executor_cls( from vllm.executor.neuron_executor import NeuronExecutorAsync executor_class = NeuronExecutorAsync elif engine_config.device_config.device_type == "tpu": - from vllm.executor.tpu_executor import TPUExecutorAsync - executor_class = TPUExecutorAsync + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_tpu_executor import RayTPUExecutorAsync + executor_class = RayTPUExecutorAsync + else: + assert distributed_executor_backend is None + from vllm.executor.tpu_executor import TPUExecutorAsync + executor_class = TPUExecutorAsync elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync