From 7f8d612d24c66e9b5f8c0aa6cb562e853e9523a0 Mon Sep 17 00:00:00 2001 From: Earthwalker <48991073+etwk@users.noreply.github.com> Date: Tue, 30 Jul 2024 03:42:21 +0800 Subject: [PATCH] [TPU] Support tensor parallelism in async llm engine (#6891) --- Dockerfile.tpu | 3 +++ vllm/engine/async_llm_engine.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 4fc14d6bd186c..adebb8ab5adca 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -12,6 +12,9 @@ RUN pip install "numpy<2" RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html +# Fix FastAPI dependency +RUN pip install "starlette<0.38.0" + # Build vLLM. COPY . /workspace/vllm ENV VLLM_TARGET_DEVICE="tpu" diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 93cc319f11c42..d3f9a0ab00f10 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -407,8 +407,14 @@ def _get_executor_cls( from vllm.executor.neuron_executor import NeuronExecutorAsync executor_class = NeuronExecutorAsync elif engine_config.device_config.device_type == "tpu": - from vllm.executor.tpu_executor import TPUExecutorAsync - executor_class = TPUExecutorAsync + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_tpu_executor import RayTPUExecutorAsync + executor_class = RayTPUExecutorAsync + else: + assert distributed_executor_backend is None + from vllm.executor.tpu_executor import TPUExecutorAsync + executor_class = TPUExecutorAsync elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync