From 28c145eb5755902505c066dc3b1e5315572cc6e7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 14 Jun 2024 14:40:09 -0700 Subject: [PATCH 01/98] [Bugfix] Fix typo in Pallas backend (#5558) --- vllm/attention/backends/pallas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 75f2465264ad3..b203c5ec54c92 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -110,7 +110,7 @@ def __init__( raise NotImplementedError("TPU version must be 4 or higher.") self.megacore_mode = None - tpu_type = torch_xla.tpu.get_tp_groupu_env()["TYPE"].lower() + tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower() if not tpu_type.endswith("lite"): if self.num_kv_heads % 2 == 0: self.megacore_mode = "kv_head" From f5bb85b435e6fe3db57fae1e25e09914015ef957 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 14 Jun 2024 14:47:45 -0700 Subject: [PATCH 02/98] [Core][Distributed] improve p2p cache generation (#5528) --- .../device_communicators/cuda_wrapper.py | 146 ++++++++++++ .../custom_all_reduce_utils.py | 215 ++++++++++-------- 2 files changed, 265 insertions(+), 96 deletions(-) create mode 100644 vllm/distributed/device_communicators/cuda_wrapper.py diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py new file mode 100644 index 0000000000000..24308235c4a48 --- /dev/null +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -0,0 +1,146 @@ +"""This file is a pure Python wrapper for the cudart library. +It avoids the need to compile a separate shared library, and is +convenient for use when we just need to call a few functions. +""" + +import ctypes +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +# this line makes it possible to directly load `libcudart.so` using `ctypes` +import torch # noqa + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +# === export types and functions from cudart to Python === +# for the original cudart definition, please check +# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +cudaError_t = ctypes.c_int +cudaMemcpyKind = ctypes.c_int + + +class cudaIpcMemHandle_t(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + + +@dataclass +class Function: + name: str + restype: Any + argtypes: List[Any] + + +class CudaRTLibrary: + exported_functions = [ + # ​cudaError_t cudaSetDevice ( int device ) + Function("cudaSetDevice", cudaError_t, [ctypes.c_int]), + # cudaError_t cudaDeviceSynchronize ( void ) + Function("cudaDeviceSynchronize", cudaError_t, []), + # ​cudaError_t cudaDeviceReset ( void ) + Function("cudaDeviceReset", cudaError_t, []), + + # const char* cudaGetErrorString ( cudaError_t error ) + Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]), + + # ​cudaError_t cudaMalloc ( void** devPtr, size_t size ) + Function("cudaMalloc", cudaError_t, + [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]), + # ​cudaError_t cudaFree ( void* devPtr ) + Function("cudaFree", cudaError_t, [ctypes.c_void_p]), + # ​cudaError_t cudaMemset ( void* devPtr, int value, size_t count ) + Function("cudaMemset", cudaError_t, + [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]), + # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa + Function("cudaMemcpy", cudaError_t, [ + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind + ]), + + # cudaError_t 
cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa + Function("cudaIpcGetMemHandle", cudaError_t, + [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]), + # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) # noqa + Function("cudaIpcOpenMemHandle", cudaError_t, [ + ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint + ]), + ] + + # class attribute to store the mapping from the path to the library + # to avoid loading the same library multiple times + path_to_library_cache: Dict[str, Any] = {} + + # class attribute to store the mapping from library path + # to the corresponding dictionary + path_to_dict_mapping: Dict[str, Dict[str, Any]] = {} + + def __init__(self, so_file: Optional[str] = None): + if so_file is None: + assert torch.version.cuda is not None + major_version = torch.version.cuda.split(".")[0] + so_file = f"libcudart.so.{major_version}" + if so_file not in CudaRTLibrary.path_to_library_cache: + lib = ctypes.CDLL(so_file) + CudaRTLibrary.path_to_library_cache[so_file] = lib + self.lib = CudaRTLibrary.path_to_library_cache[so_file] + + if so_file not in CudaRTLibrary.path_to_dict_mapping: + _funcs = {} + for func in CudaRTLibrary.exported_functions: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs + self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file] + + def CUDART_CHECK(self, result: cudaError_t) -> None: + if result != 0: + error_str = self.cudaGetErrorString(result) + raise RuntimeError(f"CUDART error: {error_str}") + + def cudaGetErrorString(self, error: cudaError_t) -> str: + return self.funcs["cudaGetErrorString"](error).decode("utf-8") + + def cudaSetDevice(self, device: int) -> None: + self.CUDART_CHECK(self.funcs["cudaSetDevice"](device)) + + def cudaDeviceSynchronize(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]()) + + def cudaDeviceReset(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceReset"]()) + + def cudaMalloc(self, size: int) -> ctypes.c_void_p: + devPtr = ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size)) + return devPtr + + def cudaFree(self, devPtr: ctypes.c_void_p) -> None: + self.CUDART_CHECK(self.funcs["cudaFree"](devPtr)) + + def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, + count: int) -> None: + self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count)) + + def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p, + count: int) -> None: + cudaMemcpyDefault = 4 + kind = cudaMemcpyDefault + self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind)) + + def cudaIpcGetMemHandle(self, + devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t: + handle = cudaIpcMemHandle_t() + self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"]( + ctypes.byref(handle), devPtr)) + return handle + + def cudaIpcOpenMemHandle(self, + handle: cudaIpcMemHandle_t) -> ctypes.c_void_p: + cudaIpcMemLazyEnablePeerAccess = 1 + devPtr = ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"]( + ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess)) + return devPtr diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index c9573edb08f33..e6957b1196969 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ 
b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,87 +1,98 @@ +import ctypes import json import os -import sys -import tempfile -import time -from contextlib import contextmanager -from typing import Callable, Dict, List, Optional +from itertools import product +from typing import Dict, Optional, Sequence -import torch import torch.distributed as dist import torch.multiprocessing as mp import vllm.envs as envs +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.logger import init_logger from vllm.utils import cuda_device_count_stateless logger = init_logger(__name__) -@contextmanager -def mute_output(): - with open(os.devnull, "w") as f: - sys.stderr = f - sys.stdout = f - yield - - -def producer(i: int, - init_method: str, +def producer(batch_src: Sequence[int], + producer_queue, + consumer_queue, + result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices - with mute_output(): - dist.init_process_group( - backend="gloo", - init_method=init_method, - world_size=2, - rank=0, - ) - # produce a tensor in GPU i - data = torch.zeros((128, ), device=f"cuda:{i}") - # get the information to reconstruct the shared tensor - func, args = torch.multiprocessing.reductions.reduce_tensor(data) - args = list(args) - dist.broadcast_object_list([(func, args)], src=0) - dist.barrier() - torch.cuda.synchronize() - assert torch.all(data == 1).item() - - -def consumer(j: int, - init_method: str, + + lib = CudaRTLibrary() + for i in batch_src: + lib.cudaSetDevice(i) + pointer = lib.cudaMalloc(1024) + lib.cudaMemset(pointer, 1, 1024) + lib.cudaDeviceSynchronize() + handle = lib.cudaIpcGetMemHandle(pointer) + producer_queue.put(handle) + open_success = consumer_queue.get() + if open_success: + # use two queues to simulate barrier + producer_queue.put(0) + consumer_queue.get() + # check if the memory is modified + host_data = (ctypes.c_char * 1024)() + lib.cudaMemcpy(host_data, pointer, 1024) # type: ignore + for i in range(1024): + if ord(host_data[i]) != 2: + open_success = False + break + result_queue.put(open_success) + lib.cudaDeviceReset() + + +def consumer(batch_tgt: Sequence[int], + producer_queue, + consumer_queue, + result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices - with mute_output(): - dist.init_process_group( - backend="gloo", - init_method=init_method, - world_size=2, - rank=1, - ) - torch.cuda.set_device(j) - recv = [None] - dist.broadcast_object_list(recv, src=0) - func: Callable - args: List - func, args = recv[0] # type: ignore - # `args[6]` is the device id - # by default pytorch will use `i` from the producer - # here we need to set it to `j` to test P2P access - args[6] = j - data = func(*args) - data += 1 - dist.barrier() - torch.cuda.synchronize() - assert torch.all(data == 1).item() - - -def can_actually_p2p(i, j): + + lib = CudaRTLibrary() + for j in batch_tgt: + lib.cudaSetDevice(j) + handle = producer_queue.get() + open_success = False + try: + pointer = lib.cudaIpcOpenMemHandle(handle) # type: ignore + open_success = True + except RuntimeError: + # cannot error out here, because the producer process + # is still waiting for the response. 
+ pass + consumer_queue.put(open_success) + if open_success: + # modify the memory + lib.cudaMemset(pointer, 2, 1024) + # use two queues to simulate barrier + producer_queue.get() + consumer_queue.put(0) + # check if the memory is modified + host_data = (ctypes.c_char * 1024)() + lib.cudaMemcpy(host_data, pointer, 1024) # type: ignore + for i in range(1024): + if ord(host_data[i]) != 2: + open_success = False + break + result_queue.put(open_success) + lib.cudaDeviceReset() + + +def can_actually_p2p( + batch_src: Sequence[int], + batch_tgt: Sequence[int], +): """ Usually, checking if P2P access is enabled can be done by - `torch.cuda.can_device_access_peer(i, j)`. However, sometimes - the driver might be broken, and `torch.cuda.can_device_access_peer(i, j)` + `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes + the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)` returns `True` even if P2P access is not actually possible. See https://github.com/vllm-project/vllm/issues/2728 and https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10 @@ -90,41 +101,50 @@ def can_actually_p2p(i, j): Note on p2p and cuda IPC: Usually, one process uses one GPU: - GPU i --> cuda context i --> tensor i --> process i + GPU src --> cuda context src --> tensor src --> process src We need to combine p2p and cuda IPC, so that: - GPU i --> cuda context i --> tensor i --> process i - |shared| - GPU j --> cuda context j --> tensor j --> process j - That is to say, process i creates a tensor in GPU i, passes IPC handle to - process j, and process j accesses the tensor in GPU j. Any operation on the - tensor in process j will be reflected in the tensor in process i, because + GPU src --> cuda context src --> tensor src --> process src + |shared| + GPU tgt --> cuda context tgt --> tensor tgt --> process tgt + That is to say, process src creates a tensor in GPU src, passes IPC handle to + process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the + tensor in process tgt will be reflected in the tensor in process src, because they are the same memory segment. - It is important to note that process j accesses the tensor in GPU j, not - GPU i. That's why we need p2p access. # noqa - """ + It is important to note that process tgt accesses the tensor in GPU tgt, not + GPU src. That's why we need p2p access. + + The most time-consuming part is the process creation. To avoid creating + processes for every pair of GPUs, we use batched testing. We create two + processes for testing all pairs of GPUs in batch. The trick is to reset + the device after each test (which is not available in PyTorch). 
+ """ # noqa cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) # pass the CUDA_VISIBLE_DEVICES to the child process # to make sure they see the same set of GPUs - # make sure the temp file is not the same across different calls - temp_path = tempfile.mktemp() + str(time.time()) - # create an empty file - with open(temp_path, "w"): - pass - init_method = f"file://{temp_path}" - # make sure the processes are spawned smp = mp.get_context("spawn") - pi = smp.Process(target=producer, - args=(i, init_method, cuda_visible_devices)) - pj = smp.Process(target=consumer, - args=(j, init_method, cuda_visible_devices)) - pi.start() - pj.start() - pi.join() - pj.join() - return pi.exitcode == 0 and pj.exitcode == 0 + producer_queue = smp.Queue() + consumer_queue = smp.Queue() + result_queue = smp.Queue() + p_src = smp.Process(target=producer, + args=(batch_src, producer_queue, consumer_queue, + result_queue, cuda_visible_devices)) + p_tgt = smp.Process(target=consumer, + args=(batch_tgt, producer_queue, consumer_queue, + result_queue, cuda_visible_devices)) + p_src.start() + p_tgt.start() + p_src.join() + p_tgt.join() + result = [] + for src, tgt in zip(batch_src, batch_tgt): + a = result_queue.get() + b = result_queue.get() + assert a == b + result.append(a) + return result # why do we need this cache? @@ -142,14 +162,14 @@ def can_actually_p2p(i, j): _gpu_p2p_access_cache: Optional[Dict[str, bool]] = None -def gpu_p2p_access_check(i: int, j: int) -> bool: - """Check if GPU i can access GPU j.""" +def gpu_p2p_access_check(src: int, tgt: int) -> bool: + """Check if GPU src can access GPU tgt.""" # if the cache variable is already calculated, # read from the cache instead of checking it again global _gpu_p2p_access_cache if _gpu_p2p_access_cache is not None: - return _gpu_p2p_access_cache[f"{i}->{j}"] + return _gpu_p2p_access_cache[f"{src}->{tgt}"] is_distributed = dist.is_initialized() @@ -169,9 +189,12 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: # enter this block to calculate the cache logger.info("generating GPU P2P access cache in %s", path) cache = {} - for _i in range(num_dev): - for _j in range(num_dev): - cache[f"{_i}->{_j}"] = can_actually_p2p(_i, _j) + ids = list(range(num_dev)) + # batch of all pairs of GPUs + batch_src, batch_tgt = zip(*list(product(ids, ids))) + result = can_actually_p2p(batch_src, batch_tgt) + for _i, _j, r in zip(batch_src, batch_tgt, result): + cache[f"{_i}->{_j}"] = r with open(path, "w") as f: json.dump(cache, f, indent=4) if is_distributed: @@ -180,7 +203,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: with open(path, "r") as f: cache = json.load(f) _gpu_p2p_access_cache = cache - return _gpu_p2p_access_cache[f"{i}->{j}"] + return _gpu_p2p_access_cache[f"{src}->{tgt}"] __all__ = ["gpu_p2p_access_check"] From bd7efe95d03773c65fa7dc1e122f3ce0e079a542 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 14 Jun 2024 19:18:22 -0500 Subject: [PATCH 03/98] Add ccache to amd (#5555) --- .buildkite/test-template-aws.j2 | 1 + Dockerfile.rocm | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 09649b625c319..01f7ff1e0e2b5 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -30,6 +30,7 @@ steps: command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" env: DOCKER_BUILDKIT: "1" + priority: 100 soft_fail: true {% endif %} {% 
endfor %} diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 954958df88fc0..724fa1673c3b3 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -42,6 +42,7 @@ RUN apt-get update && apt-get install -y \ unzip \ nvidia-cuda-toolkit \ tmux \ + ccache \ && rm -rf /var/lib/apt/lists/* ### Mount Point ### @@ -102,7 +103,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so -RUN --mount=type=cache,target=/root/.cache/pip \ +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ && python3 setup.py install \ From 1b8a0d71cf5aa1a43c14478ec90538c3fbe1b315 Mon Sep 17 00:00:00 2001 From: leiwen83 Date: Sat, 15 Jun 2024 08:23:56 +0800 Subject: [PATCH 04/98] [Core][Bugfix]: fix prefix caching for blockv2 (#5364) Signed-off-by: Lei Wen Co-authored-by: Lei Wen --- tests/core/block/e2e/test_correctness.py | 67 ++++++++++++++++++++++++ vllm/core/block/prefix_caching_block.py | 7 ++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index ad253635e0ba0..8502eab0f8da0 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -477,3 +477,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + "model": "facebook/opt-125m", + + # skip cuda graph creation for fast test. + "enforce_eager": True, + + # we keep the blocks small, so that hit eviction quickly + "max_model_len": 48, + "block_size": 16, + "num_gpu_blocks_override": 3, + + # Test APC in v2 block + "use_v2_block_manager": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{ + "enable_prefix_caching": False +}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "enable_prefix_caching": True, +}]) +@pytest.mark.parametrize("seed", [1]) +def test_auto_prefix_caching_after_evition_start(baseline_llm_generator, + test_llm_generator): + """Verify block manager v2 with auto prefix caching could works normal + even when eviction started. + With APC enabled, all blocks are held by native block at the beginning. + Then blocks are managed by evictor instead. If cache hit at the evitor's + block, then it could be reused, or we need to recompute its kv cache. + """ + output_len = 10 + temperature = 0.0 + + prompts = [ + "You are a helpful assistant. Please answer truthfully and write " + "out your thinking step by step to be sure you get the right answer. " + "If you make a mistake, attempt to correct it. who are you?", + "You are a helpful assistant. Please answer truthfully and write out " + "your thinking step by step to be sure you get the right answer. You " + "are helpful and harmless and you follow ethical guidelines. " + "who are you?" 
+ ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + print('Getting token ids with APC disabled') + baseline_token_ids = get_token_ids_from_llm_generator( + baseline_llm_generator, prompts, sampling_params) + + print('Getting token ids with APC enabled') + test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + prompts, sampling_params) + + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, + test_token_ids): + assert expected_token_ids == actual_token_ids + + assert baseline_token_ids == test_token_ids diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 405e9705659df..88dbbfb2f3690 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -176,14 +176,17 @@ def allocate_mutable(self, self._refcounter.incr(block_id) - # the block comes from evictor already contain computed result + # Now this block is pop from evictor and ready to write + # with new content which most probably different with + # original content. So need to tell worker to recompute + # its kvcache block = self._create_block( prev_block=prev_block, token_ids=[], block_size=self._block_size, allocator=self, block_id=block_id, - computed=True, + computed=False, ) assert block.content_hash is None From 0e9164b40abdb30f1929edb44b56894c9e26c31d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Jun 2024 12:45:31 +0800 Subject: [PATCH 05/98] [mypy] Enable type checking for test directory (#5017) --- .github/workflows/mypy.yaml | 2 +- benchmarks/benchmark_serving.py | 18 +++---- benchmarks/benchmark_throughput.py | 4 +- benchmarks/kernels/benchmark_aqlm.py | 10 ++-- benchmarks/kernels/benchmark_marlin.py | 8 +-- benchmarks/kernels/benchmark_moe.py | 26 +++++++--- .../kernels/benchmark_paged_attention.py | 11 ++-- benchmarks/kernels/benchmark_rope.py | 7 +-- examples/fp8/extract_scales.py | 12 ++--- examples/offline_inference_distributed.py | 8 +-- format.sh | 2 +- tests/core/block/test_block_table.py | 8 +-- tests/core/block/test_prefix_caching_block.py | 4 +- tests/core/test_chunked_prefill_scheduler.py | 10 ++-- tests/core/test_scheduler.py | 52 +++++++++---------- tests/core/utils.py | 12 +++-- tests/distributed/test_pynccl.py | 5 +- tests/distributed/test_utils.py | 5 +- tests/entrypoints/test_openai_server.py | 5 +- tests/kernels/test_attention.py | 33 ++++++------ tests/kernels/test_blocksparse_attention.py | 22 ++++---- tests/kernels/test_cache.py | 32 ++++++------ tests/kernels/test_cutlass.py | 4 +- tests/kernels/test_flash_attn.py | 4 +- tests/kernels/test_pos_encoding.py | 28 +++++----- tests/lora/conftest.py | 21 ++++++-- tests/lora/data/long_context_test_data.py | 24 ++++++++- tests/lora/test_baichuan.py | 6 ++- tests/lora/test_chatglm3.py | 6 ++- tests/lora/test_gemma.py | 6 ++- tests/lora/test_layer_variation.py | 6 +-- tests/lora/test_layers.py | 23 ++++---- tests/lora/test_llama.py | 6 ++- tests/lora/test_long_context.py | 15 +++--- tests/lora/test_lora_checkpoints.py | 4 +- tests/lora/test_lora_manager.py | 6 +-- tests/lora/test_mixtral.py | 6 ++- tests/lora/test_phi.py | 6 ++- tests/lora/test_quant_model.py | 7 ++- tests/lora/utils.py | 18 +++---- tests/models/test_fp8.py | 3 +- tests/prefix_caching/test_prefix_caching.py | 5 +- tests/quantization/test_configs.py | 3 +- tests/samplers/test_logprobs.py | 11 ++-- tests/samplers/test_rejection_sampler.py | 4 +- tests/samplers/test_sampler.py | 41 ++++++++------- 
tests/spec_decode/e2e/conftest.py | 13 ++--- tests/spec_decode/test_batch_expansion.py | 6 ++- tests/spec_decode/test_multi_step_worker.py | 19 ++++--- tests/spec_decode/test_spec_decode_worker.py | 17 ++++-- tests/spec_decode/utils.py | 14 +++-- tests/test_cache_block_hashing.py | 2 +- tests/test_logger.py | 1 + tests/tokenization/test_detokenize.py | 4 +- tests/utils.py | 2 +- tests/worker/test_model_runner.py | 23 ++++---- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/attention/backends/xformers.py | 4 +- vllm/core/block/block_table.py | 2 +- vllm/core/block/naive_block.py | 2 +- vllm/core/block/prefix_caching_block.py | 2 +- vllm/core/block_manager_v2.py | 2 +- .../custom_all_reduce_utils.py | 8 +-- .../device_communicators/pynccl_wrapper.py | 2 +- vllm/engine/llm_engine.py | 4 +- vllm/engine/metrics.py | 4 +- vllm/engine/output_processor/single_step.py | 6 +-- vllm/entrypoints/openai/run_batch.py | 3 +- vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_embedding.py | 2 +- vllm/lora/lora.py | 3 +- vllm/lora/worker_manager.py | 2 +- vllm/model_executor/layers/linear.py | 2 +- .../layers/quantization/gptq_marlin.py | 11 ++-- .../quantization/utils/marlin_24_perms.py | 18 ++++--- .../layers/quantization/utils/marlin_perms.py | 18 ++++--- vllm/model_executor/layers/sampler.py | 25 +++++---- vllm/model_executor/model_loader/loader.py | 7 +-- .../model_loader/weight_utils.py | 2 +- vllm/model_executor/models/__init__.py | 4 +- vllm/model_executor/models/arctic.py | 4 +- vllm/model_executor/models/commandr.py | 4 +- vllm/model_executor/models/gemma.py | 4 +- vllm/sequence.py | 2 +- vllm/spec_decode/multi_step_worker.py | 10 ++-- vllm/spec_decode/ngram_worker.py | 6 +-- vllm/spec_decode/spec_decode_worker.py | 8 +-- vllm/spec_decode/util.py | 4 +- vllm/transformers_utils/detokenizer.py | 2 +- vllm/utils.py | 38 ++++++++------ vllm/worker/model_runner.py | 4 +- vllm/worker/worker_base.py | 4 +- 92 files changed, 510 insertions(+), 379 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 22e6c2ef0101e..62f0dbcd93eff 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -47,5 +47,5 @@ jobs: mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml - mypy vllm/model_executor --config-file pyproject.toml + mypy tests --config-file pyproject.toml diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index df32b366c4140..c136ee572fdf3 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -31,7 +31,7 @@ import warnings from dataclasses import dataclass from datetime import datetime -from typing import AsyncGenerator, List, Optional, Tuple +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple import numpy as np from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, @@ -200,12 +200,12 @@ def calculate_metrics( dur_s: float, tokenizer: PreTrainedTokenizerBase, ) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens = [] + actual_output_lens: List[int] = [] total_input = 0 completed = 0 - itls = [] - tpots = [] - ttfts = [] + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -265,7 +265,7 @@ async def benchmark( disable_tqdm: bool, ): if backend in ASYNC_REQUEST_FUNCS: - request_func = 
ASYNC_REQUEST_FUNCS.get(backend) + request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") @@ -292,7 +292,7 @@ async def benchmark( pbar = None if disable_tqdm else tqdm(total=len(input_requests)) benchmark_start_time = time.perf_counter() - tasks = [] + tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): prompt, prompt_len, output_len = request request_func_input = RequestFuncInput( @@ -310,7 +310,7 @@ async def benchmark( pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - if not disable_tqdm: + if pbar is not None: pbar.close() benchmark_duration = time.perf_counter() - benchmark_start_time @@ -466,7 +466,7 @@ def main(args: argparse.Namespace): # Save config and results to json if args.save_result: - result_json = {} + result_json: Dict[str, Any] = {} # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 463d9973d00dd..48dfce4287671 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -108,8 +108,8 @@ def run_vllm( ) # Add the requests to the engine. - prompts = [] - sampling_params = [] + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] for prompt, _, output_len in requests: prompts.append(prompt) sampling_params.append( diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 59392947b15c8..ac6a9f297f95a 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -86,9 +86,9 @@ def dequant_no_scale( # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against # the generic pytorch version. # Just visual comparison. 
-def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: +def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') @@ -204,7 +204,7 @@ def main(): sys.stdout = sys.__stdout__ -def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, +def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): # I didn't see visible improvements from increasing these, but feel free :) @@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, print('') -def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, +def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method) -> float: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b771911781574..96f01967b351e 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,5 @@ import argparse +from typing import List import torch import torch.utils.benchmark as benchmark @@ -23,8 +24,9 @@ K_FULL_OPTS = [False, True] -def bench_run(results, model, act_order, is_k_full, num_bits, group_size, - size_m, size_k, size_n): +def bench_run(results: List[benchmark.Measurement], model: str, + act_order: bool, is_k_full: bool, num_bits: int, group_size: int, + size_m: int, size_k: int, size_n: int): label = "Quant Matmul" sub_label = ("{}, act={} k_full={}, b={}, g={}, " @@ -156,7 +158,7 @@ def main(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - results = [] + results: List[benchmark.Measurement] = [] for model in args.models: for layer in WEIGHT_SHAPES[model]: diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index be5dd32bd6f91..62347aaf8ed6d 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,7 +1,7 @@ import argparse import time from datetime import datetime -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, TypedDict import ray import torch @@ -12,8 +12,17 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + def benchmark_config( - config: Dict[str, int], + config: BenchmarkConfig, num_tokens: int, num_experts: int, shard_intermediate_size: int, @@ -92,7 +101,7 @@ def run(): start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - latencies = [] + latencies: List[float] = [] for i in range(num_iters): prepare(i) torch.cuda.synchronize() @@ -111,7 +120,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]: # Reduced search space for faster tuning. # TODO(woosuk): Increase the search space and use a performance model to # prune the search space. 
- configs = [] + configs: List[BenchmarkConfig] = [] for num_stages in [2, 3, 4, 5]: for block_m in [16, 32, 64, 128, 256]: for block_k in [64, 128, 256]: @@ -175,8 +184,8 @@ def tune( topk: int, dtype: torch.dtype, use_fp8: bool, - search_space: List[Dict[str, int]], - ) -> Dict[str, int]: + search_space: List[BenchmarkConfig], + ) -> BenchmarkConfig: best_config = None best_time = float("inf") for config in tqdm(search_space): @@ -199,10 +208,11 @@ def tune( best_config = config now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") + assert best_config is not None return best_config -def sort_config(config: Dict[str, int]) -> Dict[str, int]: +def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], @@ -214,7 +224,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]: def save_configs( - configs: Dict[int, Dict[str, int]], + configs: Dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index a5355f4c13d37..687e2369b758c 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,7 +1,7 @@ import argparse import random import time -from typing import Optional +from typing import List, Optional import torch @@ -54,14 +54,17 @@ def main( # Create the block tables. max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, + dtype=torch.int, + device=device) # Create the KV cache. 
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 00e55f6060b52..a53c6c77a5828 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,11 +1,12 @@ import argparse from itertools import accumulate -from typing import Optional +from typing import List, Optional import nvtx import torch -from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, + get_rope) def benchmark_rope_kernels_multi_lora( @@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora( }) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior - non_batched_ropes = [] + non_batched_ropes: List[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py index e007a3bc0821c..1dce9d7e993a0 100644 --- a/examples/fp8/extract_scales.py +++ b/examples/fp8/extract_scales.py @@ -2,7 +2,7 @@ import glob import json import os -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import numpy as np import torch @@ -19,7 +19,7 @@ def _prepare_hf_weights( quantized_model_dir: str, load_format: str = "auto", fall_back_to_pt: bool = True, -) -> Tuple[str, List[str], bool]: +) -> Tuple[List[str], bool]: if not os.path.isdir(quantized_model_dir): raise FileNotFoundError( f"The quantized model directory `{quantized_model_dir}` " @@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def _kv_scales_extractor( - hf_tensor_files: Iterable[str], + hf_tensor_files: List[str], use_safetensors: bool, rank_keyword: str = "rank", expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: @@ -115,7 +115,7 @@ def _kv_scales_extractor( for char in rank_keyword: assert not char.isdecimal( ), f"Rank keyword {rank_keyword} contains a numeric character!" - rank_scales_map = {} + rank_scales_map: Dict[int, Dict[int, float]] = {} for tensor_file in hf_tensor_files: try: rank_idx = tensor_file.find(rank_keyword) @@ -141,7 +141,7 @@ def _kv_scales_extractor( raise if rank not in rank_scales_map: - layer_scales_map = {} + layer_scales_map: Dict[int, float] = {} rank_scales_map[rank] = layer_scales_map else: raise RuntimeError( @@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str, "does not exist.") metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) - result = {} + result: Dict[str, Any] = {} for file in metadata_files: with open(file) as f: try: diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index 1e59e89509724..677127844ccdd 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -5,7 +5,7 @@ Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html """ -from typing import Dict +from typing import Any, Dict, List import numpy as np import ray @@ -40,8 +40,8 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: # The output is a list of RequestOutput objects that contain the prompt, # generated text, and other information. 
outputs = self.llm.generate(batch["text"], sampling_params) - prompt = [] - generated_text = [] + prompt: List[str] = [] + generated_text: List[str] = [] for output in outputs: prompt.append(output.prompt) generated_text.append(' '.join([o.text for o in output.outputs])) @@ -71,7 +71,7 @@ def scheduling_strategy_fn(): pg, placement_group_capture_child_tasks=True)) -resources_kwarg = {} +resources_kwarg: Dict[str, Any] = {} if tensor_parallel_size == 1: # For tensor_parallel_size == 1, we simply set num_gpus=1. resources_kwarg["num_gpus"] = 1 diff --git a/format.sh b/format.sh index 2fd6af03bd573..8c54b56302d5b 100755 --- a/format.sh +++ b/format.sh @@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml -mypy vllm/model_executor --config-file pyproject.toml +mypy tests --config-file pyproject.toml # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 6fb95cfdfab81..496774c8de53c 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,3 +1,5 @@ +from typing import List + import pytest from vllm.core.block.block_table import BlockTable @@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int): token_ids = list(range(sequence_len)) num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - block_tables = [] + block_tables: List[BlockTable] = [] for i in range(5): assert allocator.get_num_free_blocks( device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc @@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int): num_immutable_blocks_per_alloc = len( chunked_tokens) - num_mutable_blocks_per_alloc - block_tables = [] + block_tables: List[BlockTable] = [] for alloc_i in range(1, 6): block_tables.append( @@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int, ) block_table.allocate(token_ids=token_ids, device=Device.GPU) - appended_so_far = [] + appended_so_far: List[int] = [] for append in chunk_list(token_ids_to_append, append_size): block_table.append_token_ids(append) appended_so_far.extend(append) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bcf08cda09f46..fcf32cbe99472 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -123,7 +123,7 @@ def create_chain(block_size: int, num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks = [] + blocks: List[PrefixCachingBlock] = [] num_blocks = math.ceil( len(token_ids) / block_size) + num_empty_trailing_blocks @@ -608,7 +608,7 @@ def create_immutable_chain( ) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks = [] + blocks: List[Block] = [] num_blocks = math.ceil(len(token_ids) / block_size) if num_blocks == 0: diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index f68482cc0d906..a3b76327e0a53 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -483,11 +483,11 @@ def test_chunked_prefill_preempt(): # The request should be preempted. 
scheduler.block_manager.can_append_slots = MagicMock() - def cannot_append_second_group(seq_group, num_lookahead_slots): + def cannot_append_second_group1(seq_group, num_lookahead_slots): return seq_group.request_id != "1" scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + cannot_append_second_group1) # The running prefill is now preempted. _, out = schedule_and_update_computed_tokens(scheduler) @@ -505,11 +505,11 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert seq_group.get_num_uncomputed_tokens() == 30 # We should be able to run prefill twice as it is chunked. - def cannot_append_second_group(seq_group, num_lookahead_slots): + def cannot_append_second_group2(seq_group, num_lookahead_slots): return True scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + cannot_append_second_group2) _, out = schedule_and_update_computed_tokens(scheduler) assert len(out.scheduled_seq_groups) == 1 assert out.num_prefill_groups == 1 @@ -530,7 +530,7 @@ def test_chunked_prefill_max_seqs(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running = [] + running: List[SequenceGroup] = [] _, seq_group = create_dummy_prompt("1", prompt_length=65) scheduler.add_seq_group(seq_group) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 07fc8731e1847..bae958211cb7b 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,6 +1,6 @@ import time from collections import deque -from typing import List +from typing import Deque, List, Set, Tuple from unittest.mock import MagicMock import pytest # noqa @@ -65,7 +65,7 @@ def test_scheduler_abort_seq_group(): # Add multiple seq groups to scheduler. num_seq_group = 4 - request_ids = set() + request_ids: Set[str] = set() for i in range(num_seq_group): _, seq_group = create_dummy_prompt(str(i), block_size) scheduler.add_seq_group(seq_group) @@ -347,7 +347,7 @@ def test_prefill_schedule_max_prompt_len(): Test prompt longer than max_prompt_len is aborted. """ scheduler = initialize_scheduler(max_model_len=30) - _, seq_group = create_dummy_prompt(0, prompt_length=60) + _, seq_group = create_dummy_prompt("0", prompt_length=60) waiting = deque([seq_group]) budget = create_token_budget() remaining_waiting, output = scheduler._schedule_prefills( @@ -364,7 +364,7 @@ def test_prefill_schedule_token_budget(): Test token budget respected. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(token_budget=0) for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -419,7 +419,7 @@ def test_prefill_schedule_max_seqs(): Test max seq respected. 
""" scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(max_num_seqs=2) for i in range(3): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -453,9 +453,9 @@ def test_prefill_schedule_max_lora(): """ lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config) - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(token_budget=120) - curr_loras = set() + curr_loras: Set[int] = set() for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -499,7 +499,7 @@ def test_prefill_schedule_no_block_manager_capacity(): Test sequence cannot be scheduled due to block manager has no capacity. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget() for i in range(3): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -536,7 +536,7 @@ def test_decode_schedule_preempted(): Test decodes cannot be scheduled and preempted. """ scheduler = initialize_scheduler() - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None for i in range(3): @@ -577,7 +577,7 @@ def test_decode_swap_beam_search(): Test best_of > 1 swap out blocks """ scheduler = initialize_scheduler() - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None budget = create_token_budget() @@ -628,7 +628,7 @@ def test_schedule_decode_blocks_to_copy_update(): """ scheduler = initialize_scheduler() _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None scheduler._allocate_and_set_running(seq_group) @@ -656,10 +656,10 @@ def test_schedule_decode_blocks_to_copy_update(): def test_schedule_swapped_simple(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) @@ -683,10 +683,10 @@ def test_schedule_swapped_simple(): def test_schedule_swapped_max_token_budget(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -717,10 +717,10 @@ def test_schedule_swapped_max_token_budget(): def test_schedule_swapped_max_seqs(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for i in range(4): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) scheduler._allocate_and_set_running(seq_group) @@ -750,10 +750,10 @@ def test_schedule_swapped_max_seqs(): def test_schedule_swapped_max_loras(): lora_config = 
LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config) - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") - curr_loras = set() - blocks_to_swap_out = [] + curr_loras: Set[int] = set() + blocks_to_swap_out: List[Tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -779,10 +779,10 @@ def test_schedule_swapped_max_loras(): def test_schedule_swapped_cannot_swap_in(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -806,10 +806,10 @@ def test_schedule_swapped_cannot_swap_in(): def test_infeasible_swap(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -834,13 +834,13 @@ def test_infeasible_swap(): def test_schedule_swapped_blocks_to_copy(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] scheduler._swap_out(seq_group, blocks_to_swap_out) swapped.append(seq_group) diff --git a/tests/core/utils.py b/tests/core/utils.py index 2fbf099c5f90b..f249f4b59a2ee 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,5 +1,7 @@ import time -from typing import Iterable, Optional, Tuple +from typing import List, Optional +from typing import Sequence as GenericSequence +from typing import Tuple from vllm import SamplingParams from vllm.lora.request import LoRARequest @@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder( lora_request: Optional[LoRARequest] = None, use_beam_search: bool = False, best_of: int = 1, -) -> Tuple[Sequence, SequenceGroup]: +) -> Tuple[Sequence, Sequence, SequenceGroup]: if not block_size: block_size = decoder_prompt_length @@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder( def create_seq_group( seq_prompt_len: int = 1024, - seq_output_lens: Iterable[int] = (128, ), + seq_output_lens: GenericSequence[int] = (128, ), request_id: str = '0', seq_id_start: int = 0, sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: @@ -98,7 +100,7 @@ def create_seq_group( prompt_token_ids = [0] * seq_prompt_len - seqs = [] + seqs: List[Sequence] = [] for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, @@ -125,7 +127,7 @@ def create_seq_group( def create_seq_group_encoder_decoder( seq_prompt_len: int = 1024, - seq_output_lens: Iterable[int] = (128, ), + seq_output_lens: GenericSequence[int] = (128, ), request_id: str = '0', seq_id_start: int = 0, sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: diff --git 
a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index b788e253ab9ef..964dbc5423e75 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -1,5 +1,6 @@ import multiprocessing import os +from typing import Dict, List import pytest import torch @@ -17,9 +18,9 @@ def distributed_run(fn, world_size): number_of_processes = world_size - processes = [] + processes: List[multiprocessing.Process] = [] for i in range(number_of_processes): - env = {} + env: Dict[str, str] = {} env['RANK'] = str(i) env['LOCAL_RANK'] = str(i) env['WORLD_SIZE'] = str(number_of_processes) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 923ad66c2e08f..49d11daca9aec 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -6,7 +6,7 @@ @ray.remote -class _CUDADeviceCountStatelessTestActor(): +class _CUDADeviceCountStatelessTestActor: def get_count(self): return cuda_device_count_stateless() @@ -22,7 +22,8 @@ def test_cuda_device_count_stateless(): """Test that cuda_device_count_stateless changes return value if CUDA_VISIBLE_DEVICES is changed.""" - actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote() + actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore + num_gpus=2).remote() assert sorted(ray.get( actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] assert ray.get(actor.get_count.remote()) == 2 diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 2d7e3044d1841..d66b9b0fd3882 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1,6 +1,7 @@ # imports for guided decoding tests import json import re +from typing import List import jsonschema import openai # use the official client for correctness check @@ -453,7 +454,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True) - chunks = [] + chunks: List[str] = [] finish_reason_count = 0 async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -499,7 +500,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): temperature=0.0, stream=True, ) - chunks = [] + chunks: List[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 8bc4766fc93c4..f848ad51c7014 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -72,27 +72,27 @@ def ref_single_query_cached_kv_attention( block_size = value_cache.shape[3] num_seqs = query.shape[0] - block_tables = block_tables.cpu().tolist() - seq_lens = seq_lens.cpu().tolist() + block_tables_lst = block_tables.cpu().tolist() + seq_lens_lst = seq_lens.cpu().tolist() for i in range(num_seqs): q = query[i].unsqueeze(0) - block_table = block_tables[i] - seq_len = int(seq_lens[i]) + block_table = block_tables_lst[i] + seq_len = int(seq_lens_lst[i]) - keys = [] - values = [] + keys_lst: List[torch.Tensor] = [] + values_lst: List[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) - keys.append(k) + keys_lst.append(k) v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) + values_lst.append(v) + keys = 
torch.stack(keys_lst, dim=0) + values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: # Handle MQA and GQA keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) @@ -157,14 +157,15 @@ def test_paged_attention( # Create the block tables. max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, dtype=torch.int) # Create the KV caches. key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, @@ -283,7 +284,7 @@ def ref_multi_query_kv_attention( dtype: torch.dtype, ) -> torch.Tensor: num_seqs = len(cu_seq_lens) - 1 - ref_outputs = [] + ref_outputs: List[torch.Tensor] = [] for i in range(num_seqs): start_idx = cu_seq_lens[i] end_idx = cu_seq_lens[i + 1] @@ -303,8 +304,8 @@ def ref_multi_query_kv_attention( attn_mask=attn_mask, ) ref_outputs.append(ref_output) - ref_output = torch.cat(ref_outputs, dim=0) - return ref_output + + return torch.cat(ref_outputs, dim=0) # TODO(woosuk): Add tests for USE_ALIBI=True. diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 9da13ca6e2310..402545d1980d6 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention( block_size = value_cache.shape[3] num_seqs = query.shape[0] - block_tables = block_tables.cpu().tolist() - seq_lens = seq_lens.cpu().tolist() + block_tables_lst = block_tables.cpu().tolist() + seq_lens_lst = seq_lens.cpu().tolist() for i in range(num_seqs): q = query[i].unsqueeze(0) - block_table = block_tables[i] - seq_len = int(seq_lens[i]) + block_table = block_tables_lst[i] + seq_len = int(seq_lens_lst[i]) - keys = [] - values = [] + keys_lst: List[torch.Tensor] = [] + values_lst: List[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) - keys.append(k) + keys_lst.append(k) v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) + values_lst.append(v) + keys = torch.stack(keys_lst, dim=0) + values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: # Handle MQA and GQA keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) @@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill( value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) ref_output = ref_multi_query_kv_attention( - cu_seq_lens, + cu_seq_lens.tolist(), query, key, value, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 29572cfa57499..23b6baa60c05b 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,5 +1,5 @@ import random -from typing import Tuple +from typing import List, Tuple import pytest import torch @@ -63,7 +63,7 @@ def test_copy_blocks( src_blocks = random.sample(range(num_blocks), num_mappings) remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) - block_mapping = [] + block_mapping: List[Tuple[int, int]] 
= [] for i in range(num_mappings): src = src_blocks[i] dst1 = dst_blocks[2 * i] @@ -131,8 +131,8 @@ def test_reshape_and_cache( torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long) qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) _, key, value = qkv.unbind(dim=1) @@ -170,12 +170,12 @@ def test_reshape_and_cache( # Run the reference implementation. reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies = block_indicies.cpu().tolist() + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() + block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] @@ -224,8 +224,10 @@ def test_reshape_and_cache_flash( # Create a random slot mapping. num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device) + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, + dtype=torch.long, + device=device) qkv = torch.randn(num_tokens, 3, @@ -257,13 +259,13 @@ def test_reshape_and_cache_flash( slot_mapping, kv_cache_dtype) # Run the reference implementation. 
- block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') - block_indicies = block_indicies.cpu().tolist() + block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() + block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, block_offset, :, :] = key[i] cloned_value_cache[block_idx, block_offset, :, :] = value[i] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 777138ace656e..4d09cd8ce9c63 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -17,13 +17,13 @@ capability = capability[0] * 10 + capability[1] -def to_fp8(tensor: torch.tensor): +def to_fp8(tensor: torch.Tensor): finfo = torch.finfo(torch.float8_e4m3fn) return torch.round(tensor.clamp( min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) -def to_int8(tensor: torch.tensor): +def to_int8(tensor: torch.Tensor): return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 22772d4ea4422..cd06c27175cef 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -25,7 +25,7 @@ def ref_paged_attn( block_tables = block_tables.cpu().numpy() _, block_size, num_kv_heads, head_size = key_cache.shape - outputs = [] + outputs: List[torch.Tensor] = [] start_idx = 0 for i in range(num_seqs): query_len = query_lens[i] @@ -70,7 +70,7 @@ def ref_paged_attn( @pytest.mark.parametrize("dtype", DTYPES) @torch.inference_mode def test_flash_attn_with_paged_kv( - kv_lens: List[Tuple[int, int]], + kv_lens: List[int], num_heads: Tuple[int, int], head_size: int, dtype: torch.dtype, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index e564e325112a6..4c83659929d41 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,5 +1,5 @@ from itertools import accumulate, product -from typing import List, Optional +from typing import Dict, List, Optional import pytest import torch @@ -126,7 +126,7 @@ def test_batched_rotary_embedding( query, key, offsets=torch.zeros(batch_size * seq_len, - dtype=int, + dtype=torch.long, device=device)) # Compare the results. 
assert torch.allclose(out_query, @@ -214,20 +214,16 @@ def test_batched_rotary_embedding_multi_lora( def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] BASES = [10000, 1000000] - ROPE_SCALINGS = [ - None, { - "type": "linear", - "factor": (1, ) - }, { - "type": "dynamic", - "factor": 1 - } - ] - settings = [ - HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, - ROPE_SCALINGS, DTYPES - ] - rope_setting_id_map = {} + ROPE_SCALINGS = (None, { + "type": "linear", + "factor": (1, ) + }, { + "type": "dynamic", + "factor": 1 + }) + settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, + ROPE_SCALINGS, DTYPES) + rope_setting_id_map: Dict[str, int] = {} for setting in product(*settings): head_size, rotary_dim, max_position, base, \ is_neox_stype, rope_scaling, dtype = setting diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 522c635b82d9c..4eab73a71071c 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,6 +2,7 @@ import gc import tempfile from collections import OrderedDict +from typing import Dict, List, TypedDict from unittest.mock import MagicMock, patch import pytest @@ -24,7 +25,18 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model -LONG_LORA_INFOS = [{ + +class ContextIDInfo(TypedDict): + lora_id: int + context_length: str + + +class ContextInfo(TypedDict): + lora: str + context_length: str + + +LONG_LORA_INFOS: List[ContextIDInfo] = [{ "lora_id": 1, "context_length": "16k", }, { @@ -207,7 +219,7 @@ def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): cleanup() - infos = {} + infos: Dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] if lora_id == 1: @@ -226,7 +238,7 @@ def long_context_infos(long_context_lora_files_16k_1, @pytest.fixture -def llama_2_7b_engine_extra_embeddings() -> nn.Module: +def llama_2_7b_engine_extra_embeddings(): cleanup() get_model_old = get_model @@ -244,7 +256,6 @@ def get_model_patched(*, model_config, device_config, **kwargs): @pytest.fixture -def llama_2_7b_model_extra_embeddings( - llama_2_7b_engine_extra_embeddings) -> nn.Module: +def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings): yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. 
model_runner.model) diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py index 653e682745464..61b8899f0533c 100644 --- a/tests/lora/data/long_context_test_data.py +++ b/tests/lora/data/long_context_test_data.py @@ -1,7 +1,29 @@ # ruff: noqa """This file contains a dictionary of prompts and golden responses.""" -prompts_and_responses = { +from typing import Dict, List, TypedDict + + +class DateJSON(TypedDict): + day: int + month: int + year: int + + +class AnswerJSON(TypedDict): + nationality: str + date_of_birth: DateJSON + date_of_death: DateJSON + politician: bool + sportsperson: bool + + +class PromptResponse(TypedDict): + prompt: str + golden_answer: AnswerJSON + + +prompts_and_responses: Dict[str, List[PromptResponse]] = { "16k": [{ "prompt": "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . 
his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . 
he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . 
in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . 
myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . 
like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . 
a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . 
aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . 
after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . 
he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . 
january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . 
he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. 
[/INST]", diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 5ab863eea94b3..e1b81655c5610 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import vllm @@ -10,7 +12,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -30,7 +32,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index bd8cc98ef8ca0..de4cbea80924e 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,3 +1,5 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest @@ -6,7 +8,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -26,7 +28,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 0082c6e74e888..709246179bfe4 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,10 +1,12 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest MODEL_PATH = "google/gemma-7b" -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "Quote: Imagination is", "Quote: Be yourself;", @@ -17,7 +19,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 7d37aa6474adc..ec9776b77df76 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -26,7 +26,7 @@ def get_lora_model(model_id: str, target_modules: List[str], rank: int): return lora_model -def do_sample(llm, +def do_sample(llm: vllm.LLM, lora_path: Optional[str] = None, lora_id: Optional[int] = None, logprobs: int = 0, @@ -42,8 +42,8 @@ def do_sample(llm, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts = [] - generated_logprobs = [] + generated_texts: List[str] = [] + generated_logprobs: List[List[List[int]]] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index fc4445c657f1b..4b489670f53f9 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -109,7 +109,7 @@ def populate_loras( for slot_idx, lora_id in enumerate(id_to_index): if lora_id is not None: - subloras = [] + subloras: List[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): sublora = DummyLoRAManager().init_random_lora( @@ -158,7 +158,10 @@ def create_random_inputs( low, high = input_range - inputs, index_mapping, prompt_mapping = [], [], [] + inputs: List[torch.Tensor] = [] + index_mapping: List[int] = [] + prompt_mapping: List[int] = [] + for _ in range(num_inputs): if input_type == torch.int: inputs.append( @@ -222,7 +225,7 @@ def create_random_embedding_layer(): lora_result = lora_embedding(torch.cat(inputs)) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = embedding(input_) @@ -356,7 +359,7 @@ def create_random_embedding_layer(): lora_result = lora_embedding(torch.cat(original_inputs)) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, original_input_, lora_id in zip(inputs, original_inputs, prompt_mapping): lora = lora_dict[lora_id] @@ -482,7 +485,7 @@ def _pretest(): logits_processor.org_vocab_size = (vocab_size + lora_config.lora_extra_vocab_size) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits(hidden_states=input_, @@ -598,7 +601,7 @@ def create_random_linear_parallel_layer(): lora_result = lora_linear(torch.cat(inputs))[0] - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] @@ -729,7 +732,7 @@ class FakeConfig: lora_result = lora_linear(torch.cat(inputs))[0] - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): result = linear(input_)[0] subloras = sublora_dict[lora_id] @@ -885,9 +888,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed): computed_added_vocab_size = 0 vocab_size_padded = -1 - all_org_tokens = [] - all_added_tokens = [] - token_ids = [] + all_org_tokens: List[int] = [] + all_added_tokens: List[int] = [] + token_ids: List[int] = [] for tp_rank in range(tp_size): with patch( diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 7143a99bea081..ad8490353998f 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import ray @@ -9,7 +11,7 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf" -def do_sample(llm, lora_path: str, lora_id: int): +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 
(nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 @@ -27,7 +29,7 @@ def do_sample(llm, lora_path: str, lora_id: int): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index b58145eda2141..b50784a205af7 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -77,7 +77,7 @@ def evaluate_json_response(model_response, golden_response): def generate( - llm, + llm: vllm.LLM, inputs: Tuple[str, SamplingParams, Optional[LoRARequest]], ): prompts, sampling_param, lora_request = inputs @@ -159,7 +159,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): non-batched generation. """ # Create non batched results first to compare against batched results - non_batched_results = [] + non_batched_results: List[str] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -172,7 +172,8 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): # Create batched results # Each element of the batch must be # (prompt, prompt_sampling_params, prompt_lora_request) - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] batched_prompts.extend([ @@ -196,7 +197,8 @@ def test_self_consistency(lora_llm, long_context_infos): num_loras = len(long_context_infos) # Create results in order of long_context_infos - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] batched_prompts.extend([ @@ -244,7 +246,7 @@ def test_quality(lora_llm, long_context_infos): The test is expected to run for about 1 minute on a p4de.24xlarge instance. 
""" - scores = [] + scores: List[float] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] for prompt_and_response in prompts_and_responses[context_len]: @@ -277,7 +279,8 @@ def test_max_len(lora_llm, long_context_infos): generate(lora_llm, (bad_prompt, sampling_params, lora_request)) # Also test batched - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id_with_bad_inputs in long_context_infos: for lora_id, info in long_context_infos.items(): context_len = info["context_length"] diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index d4d1665b624ea..3514dcb7aedf4 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,3 +1,5 @@ +from typing import List + import pytest from vllm.lora.models import LoRAModel @@ -17,7 +19,7 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c08eee9910149..51a56b121ae2c 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,5 +1,5 @@ import os -from typing import List +from typing import Dict, List import pytest import torch @@ -62,7 +62,7 @@ def test_from_lora_tensors(sql_lora_files): def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str]) -> LoRAModel: - loras = {} + loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight loras[name] = LoRALayerWeights( @@ -83,7 +83,7 @@ def create_packed_lora( empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight - loras = {} + loras: Dict[str, LoRALayerWeights] = {} for replaced_module_name in replaced_module_names: if replaced_module_name == empty_replaced_module_name: continue diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index f6a8a50fa9e50..e7e7724fcec56 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -7,7 +9,7 @@ MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" -def do_sample(llm, lora_path: str, lora_id: int): +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. 
The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 @@ -20,7 +22,7 @@ def do_sample(llm, lora_path: str, lora_id: int): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index a2b42ce4cb96f..733eff48a9bf3 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,3 +1,5 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest @@ -6,7 +8,7 @@ PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format( sql_prompt= @@ -35,7 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: if lora_id else None, ) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 3d86a4366aa57..8fd968c69e58f 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -25,7 +25,10 @@ class ModelWithQuantization: ] -def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256): +def do_sample(llm: vllm.LLM, + lora_path: str, + lora_id: int, + max_tokens: int = 256) -> List[str]: raw_prompts = [ "Give me an orange-ish brown color", "Give me a neon pink color", @@ -45,7 +48,7 @@ def format_prompt_tuples(prompt): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 280e0f2043e68..b73cf5bf55324 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Dict, List, Optional import torch @@ -9,13 +9,13 @@ class DummyLoRAManager: def __init__(self): super().__init__() - self._loras = {} + self._loras: Dict[str, LoRALayerWeights] = {} def set_module_lora(self, module_name: str, lora: LoRALayerWeights): self._loras[module_name] = lora - def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]: - return self._loras.get(module_name, None) + def get_module_lora(self, module_name: str) -> LoRALayerWeights: + return self._loras[module_name] def init_random_lora(self, module_name: str, @@ -68,11 +68,11 @@ def init_packed_lora( module_name: str, input_dim: int, output_dims: List[int], - noop_lora_index: List[int] = None, - rank=8, + noop_lora_index: Optional[List[int]] = None, + rank: int = 8, ): - base_loras = [] - noop_lora_index = set(noop_lora_index or []) + base_loras: List[LoRALayerWeights] = [] + noop_lora_index_set = set(noop_lora_index or []) for i, out_dim in enumerate(output_dims): base_lora = self.init_lora( @@ -80,7 +80,7 @@ def init_packed_lora( input_dim, out_dim, rank=rank, - noop=i in noop_lora_index, + noop=i in noop_lora_index_set, ) base_loras.append(base_lora) packed_lora = PackedLoRALayerWeights.pack(base_loras) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 2b560918877a8..4ab968c01da04 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,6 +3,7 @@ Note: these tests will only pass on L4 GPU. """ import os +from typing import List import pytest import torch @@ -100,7 +101,7 @@ def test_models(example_prompts, model_name, kv_cache_dtype) -> None: ] params = SamplingParams(max_tokens=20, temperature=0) - generations = [] + generations: List[str] = [] # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. for prompt in formatted_prompts: diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 305596e16ef1c..7985001d34eb1 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -2,8 +2,11 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
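
The recurring change in the test hunks above is annotating empty containers at the point where they are created, since a bare `[]` gives mypy nothing to infer an element type from and strict settings ask for an explicit annotation. A minimal, self-contained sketch of the pattern, with made-up values standing in for the generated outputs:

    from typing import List

    # Without the annotation, mypy reports "Need type annotation" for the
    # empty list under the strict settings this patch targets.
    generated_texts: List[str] = []

    for output in ["first output ", "second output "]:  # stand-in for llm.generate(...)
        generated_texts.append(output.strip())

    # The declared element type also catches accidental mixups early, e.g.
    # generated_texts.append(42) would be rejected by mypy.
    print(generated_texts)
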
""" +from typing import List + import pytest +from vllm.block import PhysicalTokenBlock from vllm.core.block_manager_v1 import CachedBlockAllocator from vllm.utils import Device @@ -43,7 +46,7 @@ def test_block_allocator( def test_eviction(num_blocks: int, ): block_size = 16 block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks) - blocks = [] + blocks: List[PhysicalTokenBlock] = [] for i in range(num_blocks): # use i as the block_hash diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 6820b2728e3c9..b63a8d01d6621 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -4,6 +4,7 @@ """ from dataclasses import dataclass +from typing import Tuple import pytest @@ -51,7 +52,7 @@ class ModelPair: @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) -def test_auto_gptq(model_arg_exptype: str) -> None: +def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: model_path, quantization_arg, expected_type = model_arg_exptype try: diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 233540cdc391f..02a953da04659 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -62,21 +64,22 @@ def test_get_prompt_logprobs( for logprobs in result.outputs[0].logprobs: assert len(logprobs) == num_top_logprobs output_text = result.outputs[0].text - output_string_from_most_likely_tokens = [] + output_string_from_most_likely_tokens_lst: List[str] = [] for top_logprobs in result.outputs[0].logprobs: top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens.append( + output_string_from_most_likely_tokens_lst.append( top_logprob.decoded_token) if detokenize: output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens) + output_string_from_most_likely_tokens_lst) assert output_text == output_string_from_most_likely_tokens, ( "The output text from the top logprob for each token position " "should be the same as the output text in the result.") else: assert output_text == '' - assert output_string_from_most_likely_tokens == [None] * max_tokens + assert output_string_from_most_likely_tokens_lst == ([None] * + max_tokens) # The first prompt logprob is always None assert result.prompt_logprobs[0] is None diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 00a2379502e6d..6dd643bbea2bb 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -246,8 +246,8 @@ def test_rejection_sampling_approximates_target_distribution( draft_and_target_probs_equal) sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference = [] - distance_wrt_target = [] + distance_wrt_reference: List[float] = [] + distance_wrt_target: List[float] = [] for num_samples in sample_sizes: (reference_vs_rejsample_dist, diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index ddc66aa28a094..c6ef4358ea5fa 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,6 +1,6 @@ import itertools import random -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple from unittest.mock import patch import pytest @@ -49,8 +49,8 @@ def _do_sample( sampling_params: SamplingParams, device: str, ): - seq_group_metadata_list = [] - seq_lens = [] + seq_group_metadata_list: 
List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -212,7 +212,7 @@ def generate_test_case(): batch_size = random.randint(1, 128) expected_penalization = [] - sequence_metadata_list = [] + sequence_metadata_list: List[SequenceGroupMetadata] = [] # 20% chance to generate seq group metadata list with all prompts is_prompt = random.random() < 0.2 while batch_size > 0: @@ -232,8 +232,8 @@ def generate_test_case(): eos_token_id=eos_token_id, stop_token_ids=stop_token_ids) - seq_data = {} - seq_group_penalization = [] + seq_data: Dict[int, SequenceData] = {} + seq_group_penalization: List[bool] = [] for _ in range(num_seqs): num_input = random.randint(1, 100) num_generated = 0 if is_prompt else random.randint(1, 100) @@ -392,17 +392,16 @@ def generate_test_case(): else: test_cases = [generate_test_case()] - def run_test_case(*, - expected_penalization=None, - seq_group_metadata_list=None): + def run_test_case(*, expected_penalization: List[bool], + seq_group_metadata_list: List[SequenceGroupMetadata]): assert expected_penalization, \ "Invalid test case, need expected_penalization" assert seq_group_metadata_list, \ "Invalid test case, need seq_group_metadata_list" batch_size = 0 - seq_lens = [] - sampling_params_per_row = [] + seq_lens: List[int] = [] + sampling_params_per_row: List[SamplingParams] = [] for sgm in seq_group_metadata_list: sampling_params = sgm.sampling_params @@ -472,15 +471,15 @@ def test_sampler_mixed(seed: int, device: str): batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler = _prepare_test(batch_size) - seq_group_metadata_list = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] expected_tokens: List[Optional[List[int]]] = [] - seq_lens = [] + seq_lens: List[int] = [] for i in range(batch_size): expected: Optional[List[int]] = None sampling_type = random.randint(0, 3) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) - expected = [torch.argmax(fake_logits[i], dim=-1).item()] + expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] elif sampling_type in (1, 2): n = random.randint(1, 10) sampling_params = SamplingParams( @@ -536,15 +535,18 @@ def test_sampling(): ] continue + expected_tokens_item = expected_tokens[i] + assert expected_tokens_item is not None + for n, nth_output in enumerate(sequence_output.samples): if (metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed - assert nth_output.output_token == expected_tokens[i][n] + assert nth_output.output_token == expected_tokens_item[n] else: # For non-seeded random check that one of the high-logit # tokens were chosen - assert nth_output.output_token in expected_tokens[i] + assert nth_output.output_token in expected_tokens_item # Test batch test_sampling() @@ -588,8 +590,8 @@ def test_sampler_top_k_top_p(seed: int, device: str): warpers = generation_model._get_logits_warper(generation_config) assert len(warpers) == 2 # top_p and top_k - seq_group_metadata_list = [] - seq_lens = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -622,6 +624,9 @@ def mock_sample(probs, *args, **kwargs): with patch("vllm.model_executor.layers.sampler._sample", mock_sample): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + assert sample_probs is not None + 
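
The `assert sample_probs is not None` added above is the usual way to narrow an `Optional` for mypy before the value is used; the same idiom appears in several other hunks of this patch. A small sketch of the idea with illustrative names only:

    from typing import List, Optional

    def first_positive(values: List[int]) -> Optional[int]:
        for v in values:
            if v > 0:
                return v
        return None

    result = first_positive([-1, 0, 3])
    # Without the assert, mypy flags the addition below because `result`
    # may still be None according to its declared return type.
    assert result is not None
    print(result + 1)  # after the assert, mypy narrows `result` to int
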
hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) assert torch.allclose(hf_probs, sample_probs, atol=1e-5) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index f8a6de54653c1..86103cf85484a 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -118,16 +118,17 @@ def generate( raise ValueError("The lengths of prompts and " "sampling_params must be the same.") - async def get_output(prompt, sampling_param) -> str: + async def get_output(prompt, sampling_param) -> RequestOutput: request_id = random_uuid() results_generator = self.llm_engine.generate( prompt, sampling_param, request_id) final_output = None async for request_output in results_generator: final_output = request_output + assert final_output is not None return final_output - outputs = [] + outputs: List[RequestOutput] = [] try: for i in range(num_requests): prompt = prompts[i] if prompts is not None else None @@ -208,8 +209,8 @@ def maybe_assert_ngram_worker(llm): def get_output_from_llm_generator( llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: - tokens = [] - token_ids = [] + tokens: List[str] = [] + token_ids: List[List[int]] = [] for llm in llm_generator(): maybe_assert_ngram_worker(llm) @@ -300,8 +301,8 @@ def wait_for_gpu_memory_to_clear(devices: List[int], nvmlInit() start_time = time.time() while True: - output = {} - output_raw = {} + output: Dict[int, str] = {} + output_raw: Dict[int, float] = {} for device in devices: dev_handle = nvmlDeviceGetHandleByIndex(device) mem_info = nvmlDeviceGetMemoryInfo(dev_handle) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 43cfd78ddb0cc..42dd90422ec47 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -38,14 +40,14 @@ def test_get_token_ids_to_score(k: int): device='cuda', ) - expected_output = [ + expected_output: List[List[int]] = [ [], ] for i in range(proposal_token_ids.shape[0]): expected_output.append(proposal_token_ids[:i + 1].tolist()) scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist()) # pylint: disable=protected-access actual_output = [ x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 6cea6668acc91..a6eb628f9198f 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,11 +1,12 @@ import random +from typing import Dict, List from unittest.mock import MagicMock import pytest import torch from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker @@ -210,7 +211,7 @@ def test_same_output_for_multi_step(): # Run single-step repeatedly. 
zero_kv_cache(worker.cache_engine) - single_step_output = [] + single_step_output: List[SamplerOutput] = [] continuations = [[1] for _ in prompts] set_random_seed(seed) @@ -232,11 +233,15 @@ def test_same_output_for_multi_step(): continuations[i].append(seq_group_output.samples[0].output_token) # Get token ids and logprobs for comparison. - multi_step_output_logprobs = [[] for _ in prompts] - single_step_output_logprobs = [[] for _ in prompts] - - multi_step_output_token_ids = [[] for _ in prompts] - single_step_output_token_ids = [[] for _ in prompts] + multi_step_output_logprobs: List[List[Dict[int, + Logprob]]] = [[] + for _ in prompts] + single_step_output_logprobs: List[List[Dict[int, + Logprob]]] = [[] + for _ in prompts] + + multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts] + single_step_output_token_ids: List[List[int]] = [[] for _ in prompts] for i, _ in enumerate(prompts): for multi_step, single_step in zip(multi_step_output, single_step_output): diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index ef9d32f73d668..afaeffc9681cb 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,5 +1,6 @@ import random from types import SimpleNamespace +from typing import Dict, List from unittest.mock import MagicMock import pytest @@ -7,7 +8,7 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) @@ -103,7 +104,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)) - seen_contexts = [] + seen_contexts: List[List[int]] = [] call_args_list = target_worker.execute_model.call_args_list assert len(call_args_list) == 1 @@ -116,7 +117,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): for seq_data in seq_group_metadata.seq_data.values(): seen_contexts.append(seq_data.get_token_ids()) - expected_seen_contexts = [] + expected_seen_contexts: List[List[int]] = [] for prompt, prev_generated, draft_tokens in zip( prompts, prev_output_tokens, proposal_token_ids.tolist()): @@ -310,8 +311,14 @@ def test_correctly_formats_output(k: int, batch_size: int): next(iter(seq_group_metadata.seq_data.keys())) for seq_group_metadata in seq_group_metadata_list ] - actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} - expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + actual_output_by_seq: Dict[int, List[SequenceOutput]] = { + seq_id: [] + for seq_id in seq_ids + } + expected_output_by_seq: Dict[int, List[SequenceOutput]] = { + seq_id: [] + for seq_id in seq_ids + } for step in output: for seq_group in step: diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index d52b22c30bd43..ce5b347832c30 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,5 +1,7 @@ from itertools import count -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import TypeVar, Union from unittest.mock import MagicMock import torch @@ -14,6 +16,8 @@ from vllm.worker.cache_engine 
import CacheEngine from vllm.worker.worker import Worker +T = TypeVar("T", bound=Worker) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -56,13 +60,13 @@ def zero_kv_cache(cache_engine: CacheEngine): value_blocks.zero_() -def create_worker(cls: type, +def create_worker(cls: Callable[..., T], model_name: str, block_size: int, num_gpu_blocks: int, seed: int, is_driver_worker: bool = True, - enforce_eager: bool = True): + enforce_eager: bool = True) -> T: engine_args = EngineArgs( model=model_name, seed=seed, @@ -159,8 +163,8 @@ def assert_logprobs_dict_allclose( def create_sampler_output_list( token_ids: torch.Tensor, - probs: Iterable[Optional[torch.Tensor]], - logprobs: Iterable[Optional[torch.Tensor]], + probs: GenericSequence[Optional[torch.Tensor]], + logprobs: GenericSequence[Optional[torch.Tensor]], seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: num_steps, batch_size = token_ids.shape token_ids_by_step = token_ids.tolist() diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 0fbe3dae1ff08..fe413d1228021 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -51,7 +51,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, max_input_length=None, ) - hashes = [] + hashes: List[List[List[int]]] = [] for prefix in prefixes: for lora_int_id in concurrent_lora_int_ids: diff --git a/tests/test_logger.py b/tests/test_logger.py index 74f1125fb37c9..52aa73761fd68 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -47,6 +47,7 @@ def test_default_vllm_root_logger_configuration(): assert not logger.propagate handler = logger.handlers[0] + assert isinstance(handler, logging.StreamHandler) assert handler.stream == sys.stdout assert handler.level == logging.INFO diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 8d019fe5f38ca..12e5ae85adea6 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -153,8 +153,8 @@ def test_decode_sequence_logprobs(complete_sequence: str, # Run sequentially. 
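
The `create_worker` change above types the factory as `Callable[..., T]` with `T = TypeVar("T", bound=Worker)`, so the return type follows whichever worker class is passed in. A stripped-down sketch of the same generic pattern, using stand-in classes rather than vLLM's actual `Worker`:

    from typing import Callable, TypeVar

    class Worker:  # stand-in for the real Worker base class
        def execute(self) -> str:
            return "base"

    class MultiStepWorker(Worker):  # stand-in subclass
        def execute_multi(self) -> str:
            return "multi-step"

    T = TypeVar("T", bound=Worker)

    def create_worker(cls: Callable[..., T]) -> T:
        # The factory's return type is tied to the callable that was passed
        # in, so callers see the concrete subclass instead of the base type.
        return cls()

    w = create_worker(MultiStepWorker)
    print(w.execute_multi())  # inferred as MultiStepWorker, so this type-checks
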
seq = create_sequence() dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids) - sequential_logprobs_text_chosen_token = [] - sequential_logprobs_text_other_token = [] + sequential_logprobs_text_chosen_token: List[str] = [] + sequential_logprobs_text_other_token: List[str] = [] for new_token, logprobs in zip(complete_sequence_token_ids, dummy_logprobs): seq.append_token_id(new_token, logprobs) diff --git a/tests/utils.py b/tests/utils.py index c84364d20fc63..f2b2d22b1ebc5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -79,7 +79,7 @@ def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: self.host = str(args.host or 'localhost') self.port = int(args.port) - self._runner = self._RemoteRunner.remote( + self._runner = self._RemoteRunner.remote( # type: ignore cli_args, wait_url=self.url_for("health"), wait_timeout=self.MAX_SERVER_START_WAIT_S) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 514a57e17ebf4..dd0d3bf5082d9 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -35,8 +37,8 @@ def test_prepare_prompt(batch_size): enable_chunked_prefill=False, ) - seq_lens = [] - seq_group_metadata_list = [] + seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] block_tables = {0: [1]} for i in range(batch_size): # make sure all tokens fit into one block @@ -151,15 +153,14 @@ def test_prepare_decode_cuda_graph(batch_size): enable_chunked_prefill=False, ) - context_lens = [] - seq_group_metadata_list = [] + context_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] # Assume each seq group finishes prefill. for i in range(batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 context_lens.append(context_len) - seq_data = list(range(context_len)) - seq_data = SequenceData(seq_data) + seq_data = SequenceData(list(range(context_len))) seq_data.update_num_computed_tokens(context_len) # Append one token ID since prefill is finished. seq_data.append_token_id(1, 0) @@ -257,7 +258,7 @@ def test_empty_seq_group(): dtype="float16", enforce_eager=False, ) - seq_group_metadata_list = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input(seq_group_metadata_list) input_tokens, input_positions, attn_metadata, slot_mapping = ( model_input.input_tokens, @@ -310,10 +311,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): ) # Add prefill requests. 
- seq_lens = [] - seq_group_metadata_list = [] - prefill_metadata_list = [] - decode_metadata_list = [] + seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + prefill_metadata_list: List[SequenceGroupMetadata] = [] + decode_metadata_list: List[SequenceGroupMetadata] = [] block_tables = {0: [1]} prefill_batch_size = batch_size // 2 decode_batch_size = batch_size - prefill_batch_size diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 4b08cce99afb0..c01e0a0a3a19c 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -245,7 +245,7 @@ def _make_alibi_bias( dtype: torch.dtype, seq_lens: List[int], ) -> List[torch.Tensor]: - attn_biases = [] + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses @@ -271,7 +271,7 @@ def _make_sliding_window_bias( window_size: Optional[int], dtype: torch.dtype, ) -> List[torch.Tensor]: - attn_biases = [] + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: tensor = torch.full( (1, seq_len, seq_len), diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 99a3e88bc07b6..0fecd9f6e610c 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -431,8 +431,8 @@ def _make_alibi_bias( num_kv_heads: int, dtype: torch.dtype, seq_lens: List[int], -) -> LowerTriangularMaskWithTensorBias: - attn_biases = [] +) -> List[AttentionBias]: + attn_biases: List[AttentionBias] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 26f378ba24b76..d705f3d91a074 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -252,7 +252,7 @@ def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block], token_ids: List[int], device: Device) -> List[Block]: - blocks = [] + blocks: List[Block] = [] for block_token_ids in chunk_list(token_ids, self._block_size): if len(block_token_ids) == self._block_size: # If the block is full, create an immutable block. diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index d033787122d7a..50f27bab33776 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -111,7 +111,7 @@ def fork(self, last_block: Block) -> List[Block]: """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 88dbbfb2f3690..2df7d74e4ff19 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -271,7 +271,7 @@ def fork(self, last_block: Block) -> List[Block]: """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: refcount = self._refcounter.incr(block.block_id) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 121092cf189bd..309775237a715 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -260,7 +260,7 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): # at max extend. 
if self.enable_caching: block_table = self.block_tables[seq.seq_id] - block_ids = [] + block_ids: List[Optional[int]] = [] for block_id in block_table.physical_block_ids: block_ids.append(block_id) self.block_allocator.mark_blocks_as_accessed( diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index e6957b1196969..75b7c374c8e6e 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -2,7 +2,7 @@ import json import os from itertools import product -from typing import Dict, Optional, Sequence +from typing import Dict, List, Optional, Sequence import torch.distributed as dist import torch.multiprocessing as mp @@ -88,7 +88,7 @@ def consumer(batch_tgt: Sequence[int], def can_actually_p2p( batch_src: Sequence[int], batch_tgt: Sequence[int], -): +) -> Sequence[bool]: """ Usually, checking if P2P access is enabled can be done by `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes @@ -138,7 +138,7 @@ def can_actually_p2p( p_tgt.start() p_src.join() p_tgt.join() - result = [] + result: List[bool] = [] for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() @@ -188,7 +188,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: # only the local master process (with local_rank == 0) can # enter this block to calculate the cache logger.info("generating GPU P2P access cache in %s", path) - cache = {} + cache: Dict[str, bool] = {} ids = list(range(num_dev)) # batch of all pairs of GPUs batch_src, batch_tgt = zip(*list(product(ids, ids))) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 50d6719fbfe62..7619c98f22148 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -205,7 +205,7 @@ def __init__(self, so_file: Optional[str] = None): raise e if so_file not in NCCLLibrary.path_to_dict_mapping: - _funcs = {} + _funcs: Dict[str, Any] = {} for func in NCCLLibrary.exported_functions: f = getattr(self.lib, func.name) f.restype = func.restype diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b2f6478cbfd7b..fd64337d4384c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2,7 +2,7 @@ from contextlib import contextmanager from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional from typing import Sequence as GenericSequence -from typing import Type, TypeVar, Union +from typing import Set, Type, TypeVar, Union from transformers import GenerationConfig, PreTrainedTokenizer @@ -973,7 +973,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_executor.remove_lora(lora_id) - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: return self.model_executor.list_loras() def check_health(self) -> None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ae7ae144bc04f..027f5c7e73c2b 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -144,7 +144,7 @@ def __init__(self, labelnames: List[str], max_model_len: int): # end-metrics-definitions -def build_1_2_5_buckets(max_value: int): +def build_1_2_5_buckets(max_value: int) -> List[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by mantissa values (1, 2, 5) until the value exceeds 
the specified maximum. @@ -155,7 +155,7 @@ def build_1_2_5_buckets(max_value: int): """ mantissa_lst = [1, 2, 5] exponent = 0 - buckets = [] + buckets: List[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index cad44f476f06e..07a68c65a6dd8 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler @@ -146,8 +146,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Beam search case # Select the child sequences to keep in the sequence group. - selected_child_seqs = [] - unselected_child_seqs = [] + selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] + unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] beam_width = seq_group.sampling_params.best_of length_penalty = seq_group.sampling_params.length_penalty diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 7a6819c35a92d..91e567924b59e 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -2,6 +2,7 @@ import asyncio import sys from io import StringIO +from typing import Awaitable, List import aiohttp @@ -114,7 +115,7 @@ async def main(args): ) # Submit all requests in the file to the engine "concurrently". - response_futures = [] + response_futures: List[Awaitable[BatchRequestOutput]] = [] for request_json in (await read_file(args.input_file)).strip().split("\n"): request = BatchRequestInput.model_validate_json(request_json) response_futures.append(run_request(openai_serving_chat, request)) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7cd434fe0d272..76940612496a0 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -487,7 +487,7 @@ async def chat_completion_full_generator( final_res = res assert final_res is not None - choices = [] + choices: List[ChatCompletionResponseChoice] = [] role = self.get_chat_request_role(request) for output in final_res.outputs: diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 5a3448de3d7a4..cbf09f173fb66 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -25,7 +25,7 @@ def request_output_to_embedding_response( created_time: int, model_name: str, ) -> EmbeddingResponse: - data = [] + data: List[EmbeddingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): assert final_res is not None diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index d7794aa7cd35c..8f3c7f76932af 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,4 +1,5 @@ from typing import List, Optional +from typing import Sequence as GenericSequence import torch @@ -120,7 +121,7 @@ def __init__( @classmethod def pack( - cls, loras: List[Optional["LoRALayerWeights"]] + cls, loras: GenericSequence[Optional["LoRALayerWeights"]] ) -> "PackedLoRALayerWeights": """Pack a list of LoRAs into a single LoRA. 
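
For the `build_1_2_5_buckets` helper annotated above, the buckets grow as 1, 2, 5 times increasing powers of ten until the maximum is exceeded. A self-contained sketch of that behaviour, assuming the loop simply appends every value not exceeding the maximum and returns at the first value that does (the hunk only shows the start of the body):

    from typing import List

    def build_1_2_5_buckets(max_value: int) -> List[int]:
        mantissa_lst = [1, 2, 5]
        exponent = 0
        buckets: List[int] = []
        while True:
            for m in mantissa_lst:
                value = m * 10**exponent
                if value <= max_value:
                    buckets.append(value)
                else:
                    return buckets
            exponent += 1

    print(build_1_2_5_buckets(100))  # [1, 2, 5, 10, 20, 50, 100]
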
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4657757bd484b..498b2b9ddb18a 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -165,7 +165,7 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: model = self._lora_manager.model supported_lora_modules = model.supported_lora_modules packed_modules_mapping = model.packed_modules_mapping - expected_lora_modules = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend( diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f5b6bdd9f7fd7..58c379bcd88df 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -393,7 +393,7 @@ def weight_loader(self, param_data.copy_(loaded_weight) return current_shard_offset = 0 - shard_offsets = [] + shard_offsets: List[Tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ae440743fdf8e..599070f1550ca 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -25,24 +25,25 @@ # Permutations for Marlin scale shuffling -def get_scale_perms(num_bits): - scale_perm = [] +def get_scale_perms(num_bits: int): + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return scale_perm, scale_perm_single -def get_pack_factor(num_bits): +def get_pack_factor(num_bits: int): assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS ), f"Unsupported num_bits = {num_bits}" return 32 // num_bits -def marlin_permute_scales(s, size_k, size_n, group_size, num_bits): +def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int, + group_size: int, num_bits: int): scale_perm, scale_perm_single = get_scale_perms(num_bits) if group_size < size_k and group_size != -1: s = s.reshape((-1, len(scale_perm)))[:, scale_perm] diff --git a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py index 12e77cb710687..93f65a20d4e4a 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py @@ -1,4 +1,6 @@ """This file is used for /tests and /benchmarks""" +from typing import Dict, List + import numpy import torch @@ -11,10 +13,10 @@ # # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 # (without the need to use ldmatrix instructions) # noqa: E501 -def get_perms_24(num_bits): - perm_list = [] +def get_perms_24(num_bits: int): + perm_list: List[int] = [] for i in range(32): - perm1 = [] + perm1: List[int] = [] col = i // 4 col_o = col // 2 for block in [0, 1]: @@ -39,18 +41,18 @@ def get_perms_24(num_bits): perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) - scale_perm = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) - scale_perm_single = [] + 
scale_perm_single: List[int] = [] for i in range(8): scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) return perm, scale_perm, scale_perm_single -marlin_24_perm = {} -marlin_24_scale_perm = {} -marlin_24_scale_perm_single = {} +marlin_24_perm: Dict[int, torch.Tensor] = {} +marlin_24_scale_perm: Dict[int, List[int]] = {} +marlin_24_scale_perm_single: Dict[int, List[int]] = {} for num_bits in [4, 8]: perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits) marlin_24_perm[num_bits] = perm_24 diff --git a/vllm/model_executor/layers/quantization/utils/marlin_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_perms.py index 76bd2ff7c724e..db5e6857a8846 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_perms.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_perms.py @@ -1,4 +1,6 @@ """This file is used for /tests and /benchmarks""" +from typing import Dict, List + import numpy import torch @@ -11,10 +13,10 @@ # # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 # (without the need to use ldmatrix instructions) # noqa: E501 -def get_perms(num_bits): - perm_list = [] +def get_perms(num_bits: int): + perm_list: List[int] = [] for i in range(32): - perm1 = [] + perm1: List[int] = [] col = i // 4 for block in [0, 1]: for row in [ @@ -38,19 +40,19 @@ def get_perms(num_bits): perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) - scale_perm = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return perm, scale_perm, scale_perm_single -marlin_perm = {} -marlin_scale_perm = {} -marlin_scale_perm_single = {} +marlin_perm: Dict[int, torch.Tensor] = {} +marlin_scale_perm: Dict[int, List[int]] = {} +marlin_scale_perm_single: Dict[int, List[int]] = {} for num_bits in [4, 8]: perm, scale_perm, scale_perm_single = get_perms(num_bits) marlin_perm[num_bits] = perm diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index a84f562909d50..e07360a2fd682 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -174,7 +174,7 @@ def _apply_min_tokens_penalty( min_tokens = sampling_params.min_tokens token_ids_to_penalize = sampling_params.all_stop_token_ids if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize = [] + seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] if len(seq_data.output_token_ids) < min_tokens: @@ -285,7 +285,7 @@ def _greedy_sample( same as the length of selected_seq_groups. If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ - samples = samples.tolist() + samples_lst = samples.tolist() sample_idx = 0 results: SampleResultType = [] for seq_group in selected_seq_groups: @@ -298,7 +298,7 @@ def _greedy_sample( assert num_parent_seqs == 1, ( "Greedy sampling should have only one seq.") parent_ids = list(range(num_parent_seqs)) - next_token_ids = [samples[sample_idx]] + next_token_ids = [samples_lst[sample_idx]] results.append((next_token_ids, parent_ids)) sample_idx += num_parent_seqs return results @@ -394,7 +394,7 @@ def _beam_search_sample( next_token_ids = next_token_ids.tolist() else: # Generation phase. 
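
Several hunks in this patch (the `samples` to `samples_lst` rename in `_greedy_sample` below, and `output_string_from_most_likely_tokens_lst` in the logprobs test earlier) follow the same rule: keep the tensor and the Python list derived from it under different names, because mypy assigns a single type per variable. A small sketch with illustrative names:

    from typing import List

    import torch

    samples = torch.tensor([3, 1, 2])

    # Keeping the tensor and the converted list under separate names gives
    # each variable one stable type; rebinding `samples` to the list is what
    # the renames in these hunks avoid.
    samples_lst: List[int] = samples.tolist()
    print(samples_lst[0] + 1)
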
- cumulative_logprobs: List[int] = [ + cumulative_logprobs: List[float] = [ seq_group.seq_data[seq_id].cumulative_logprob for seq_id in seq_ids ] @@ -466,8 +466,9 @@ def _sample_with_torch( categorized_seq_group_ids[sampling_type].append(i) sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} - multinomial_samples = {} + sample_metadata: Dict[SamplingType, + Tuple[List[int], List[SequenceGroupToSample]]] = {} + multinomial_samples: Dict[SamplingType, torch.Tensor] = {} # Create output tensor for sampled token ids. if include_gpu_probs_tensor: @@ -494,7 +495,7 @@ def _sample_with_torch( greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) - if include_gpu_probs_tensor: + if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. sampled_token_ids_tensor[ long_sample_indices] = greedy_samples.unsqueeze(-1) @@ -522,7 +523,7 @@ def _sample_with_torch( probs[long_sample_indices], max_best_of_in_batch, **seeded_args) - if include_gpu_probs_tensor: + if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. sampled_token_ids_tensor[ long_sample_indices] = multinomial_samples[sampling_type] @@ -571,7 +572,9 @@ def _sample_with_triton_kernel( categorized_seq_group_ids[sampling_type].append(i) sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} + sample_metadata: Dict[SamplingType, + Tuple[List[int], List[SequenceGroupToSample], + torch.Tensor, torch.Tensor]] = {} max_best_of_in_batch = 1 # Counterintiutively, having two loops here is actually faster. @@ -1008,14 +1011,14 @@ def _build_sampler_output( speculative decoding rejection sampling. """ - sampler_output = [] + sampler_output: List[CompletionSequenceGroupOutput] = [] for (seq_group, sample_result, group_prompt_logprobs, group_sample_logprobs) in zip(sampling_metadata.seq_groups, sample_results, prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result - seq_outputs = [] + seq_outputs: List[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip(parent_ids, next_token_ids, group_sample_logprobs): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 06de2fcc1cc70..d3babcf9c3451 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -68,7 +68,7 @@ def _get_model_initialization_kwargs( vision_language_config: Optional[VisionLanguageConfig] ) -> Dict[str, Any]: """Get extra kwargs for model initialization.""" - extra_kwargs = {} + extra_kwargs: Dict[str, Any] = {} if hasattr(model_class, "supported_lora_modules"): extra_kwargs["lora_config"] = lora_config elif lora_config: @@ -446,7 +446,8 @@ def _filter_subtensors( Filter out all tensors that share the same memory or a subset of the memory of another tensor. 
""" - same_storage_groups = collections.defaultdict(list) + same_storage_groups: Dict[Any, List[Tuple[ + str, torch.Tensor]]] = collections.defaultdict(list) for key, tensor in tensors.items(): if tensor.numel(): ptr = tensor.untyped_storage().data_ptr() @@ -455,7 +456,7 @@ def _filter_subtensors( def get_end_ptr(tensor: torch.Tensor) -> int: return tensor.view(-1)[-1].data_ptr() + tensor.element_size() - result = {} + result: Dict[str, torch.Tensor] = {} for group in same_storage_groups.values(): for k, t in group: a, b = t.data_ptr(), get_end_ptr(t) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 827591b227a2b..943022a3f03c7 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -329,7 +329,7 @@ def np_cache_weights_iterator( # dumping the same model weights to numpy at the same time. with get_lock(model_name_or_path, cache_dir): if not os.path.exists(weight_names_file): - weight_names = [] + weight_names: List[str] = [] for bin_file in hf_weights_files: state = torch.load(bin_file, map_location="cpu") for name, param in state.items(): diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 4446914c67c8e..bed6f518ca033 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -72,11 +72,11 @@ _OOT_MODELS: Dict[str, Type[nn.Module]] = {} # Models not supported by ROCm. -_ROCM_UNSUPPORTED_MODELS = [] +_ROCM_UNSUPPORTED_MODELS: List[str] = [] # Models partially supported by ROCm. # Architecture -> Reason. -_ROCM_PARTIALLY_SUPPORTED_MODELS = { +_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { "Qwen2ForCausalLM": "Sliding window attention is not yet supported in ROCm's flash attention", "MistralForCausalLM": diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 313762b1353d1..5777611079c66 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -453,8 +453,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping = [] - expert_params_mapping = [] + mlp_params_mapping: List[Tuple[str, str, int]] = [] + expert_params_mapping: List[Tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 84786921ce1b4..11d88d45e993c 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -20,7 +20,7 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.utils.checkpoint @@ -352,7 +352,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 27dda00b66af4..65f4ebec5bcf0 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,7 +15,7 @@ # limitations under the License. 
"""Inference-only Gemma model compatible with HuggingFace weights.""" from functools import lru_cache -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -363,7 +363,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/sequence.py b/vllm/sequence.py index 2f27bf33b166e..54243bfb1e913 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -123,7 +123,7 @@ def __init__( output_token_ids = [] self.prompt_token_ids = prompt_token_ids - self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids) + self._prompt_token_ids_tuple = tuple(prompt_token_ids) self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 # The number of tokens that are computed (that run against the model). diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index fe15ea33b5f36..668ceefe6175f 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,10 +1,10 @@ import copy import weakref -from typing import List, Tuple +from typing import Dict, List, Tuple import torch -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase @@ -71,7 +71,7 @@ def sampler_output( sample_len) # Run model sample_len times. - model_outputs = [] + model_outputs: List[SamplerOutput] = [] for _ in range(sample_len): model_output = super().execute_model( execute_model_req=copied_execute_model_req) @@ -132,7 +132,7 @@ def _shallow_copy_inputs( # Shallow-copy the list of SequenceGroupMetadata. This allows us to # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] + new_seq_group_metadata_list: List[SequenceGroupMetadata] = [] for old_seq_group_metadata in seq_group_metadata_list: # We must shallow-copy seq_group_metadata as is_prompt could change. @@ -140,7 +140,7 @@ def _shallow_copy_inputs( new_seq_group_metadata_list.append(seq_group_metadata) # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} + new_seq_data: Dict[int, SequenceData] = {} for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): new_seq_data[seq_id] = copy.copy(old_seq_data) new_seq_data[ diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 33af588d0ba29..23a3e1649914b 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -48,7 +48,7 @@ def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, - ) -> Tuple[Optional[List[SamplerOutput]], bool]: + ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]: """NGram match algo to pick proposal candidate. Returns the list of sampler output, one per SequenceGroupMetadata. 
@@ -58,8 +58,8 @@ def sampler_output( self._raise_if_unsupported(execute_model_req) has_spec_out = False - token_id_list = [] - token_prob_list = [] + token_id_list: List[Optional[torch.Tensor]] = [] + token_prob_list: List[Optional[torch.Tensor]] = [] for idx, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): seq_data = next(iter(seq_group_metadata.seq_data.values())) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8b147c80690dd..03fad5663037b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -7,8 +7,8 @@ from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, + SamplerOutput, SequenceGroupMetadata) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) @@ -516,13 +516,13 @@ def _create_output_sampler_list( topk_indices_by_step = topk_indices_by_step.tolist() # Construct the output on a per-step, per-sequence basis. - sampler_output_list = [] + sampler_output_list: List[SamplerOutput] = [] for step_index in range(num_steps): if all(token_id == -1 for token_id in accepted_token_ids_by_step[step_index]): break - step_output_token_ids = [] + step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): # Each sequence may have a different num_logprobs; retrieve it. num_logprobs = num_logprobs_per_seq[sequence_index] diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 60ed9d39eb8d6..9bbe3f8d16117 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -26,10 +26,10 @@ def get_all_num_logprobs( sequence. 
""" - all_num_logprobs = [] + all_num_logprobs: List[int] = [] for seq_group_metadata in seq_group_metadata_list: num_logprobs = seq_group_metadata.sampling_params.logprobs - if seq_group_metadata.sampling_params.logprobs is None: + if num_logprobs is None: num_logprobs = 0 all_num_logprobs.append(num_logprobs) diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index f064c26c3f40c..e8e53f4946efa 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -44,7 +44,7 @@ def decode_prompt_logprobs_inplace( read_offset = 0 next_iter_prefix_offset = 0 next_iter_read_offset = 0 - next_iter_tokens = [] + next_iter_tokens: List[str] = [] prev_tokens = None for token_position, prompt_logprobs_for_token in enumerate( diff --git a/vllm/utils.py b/vllm/utils.py index b5c42605ba358..9b39ca77a9801 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -20,12 +20,13 @@ import numpy as np import psutil import torch +import torch.types +from typing_extensions import ParamSpec import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import enable_trace_function_call, init_logger -T = TypeVar("T") logger = init_logger(__name__) STR_DTYPE_TO_TORCH_DTYPE = { @@ -37,6 +38,10 @@ "fp8_e5m2": torch.uint8, } +P = ParamSpec('P') +K = TypeVar("K") +T = TypeVar("T") + class Device(enum.Enum): GPU = enum.auto() @@ -176,7 +181,7 @@ def random_uuid() -> str: @lru_cache(maxsize=None) -def get_vllm_instance_id(): +def get_vllm_instance_id() -> str: """ If the environment variable VLLM_INSTANCE_ID is set, return it. Otherwise, return a random UUID. @@ -192,7 +197,7 @@ def in_wsl() -> bool: return "microsoft" in " ".join(uname()).lower() -def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: +def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]: """Take a blocking function, and run it on in an executor thread. This function prevents the blocking function from blocking the @@ -200,7 +205,7 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: The code in this function needs to be thread safe. 
""" - def _async_wrapper(*args, **kwargs) -> asyncio.Future: + def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: loop = asyncio.get_event_loop() p_func = partial(func, *args, **kwargs) return loop.run_in_executor(executor=None, func=p_func) @@ -325,7 +330,7 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def chunk_list(lst, chunk_size): +def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]: """Yield successive chunk_size chunks from lst.""" return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] @@ -336,7 +341,7 @@ def cdiv(a: int, b: int) -> int: def _generate_random_fp8( - tensor: torch.tensor, + tensor: torch.Tensor, low: float, high: float, ) -> None: @@ -398,7 +403,10 @@ def create_kv_caches_with_random_flash( torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) scale = head_size**-0.5 - key_caches, value_caches = [], [] + + key_caches: List[torch.Tensor] = [] + value_caches: List[torch.Tensor] = [] + for _ in range(num_layers): key_value_cache = torch.empty(size=key_value_cache_shape, dtype=torch_dtype, @@ -429,7 +437,7 @@ def create_kv_caches_with_random( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=torch_dtype).element_size() key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - key_caches = [] + key_caches: List[torch.Tensor] = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, @@ -444,7 +452,7 @@ def create_kv_caches_with_random( key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) - value_caches = [] + value_caches: List[torch.Tensor] = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, @@ -484,7 +492,7 @@ def is_pin_memory_available() -> bool: class CudaMemoryProfiler: - def __init__(self, device=None): + def __init__(self, device: Optional[torch.types.Device] = None): self.device = device def current_memory_usage(self) -> float: @@ -560,13 +568,13 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() -def merge_dicts(dict1: Dict[Any, List[Any]], - dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]: +def merge_dicts(dict1: Dict[K, List[T]], + dict2: Dict[K, List[T]]) -> Dict[K, List[T]]: """Merge 2 dicts that have key -> List of items. When a key conflicts, the values in dict1 is prioritized. """ - merged_dict = defaultdict(list) + merged_dict: Dict[K, List[T]] = defaultdict(list) for key, value in dict1.items(): merged_dict[key].extend(value) @@ -577,7 +585,7 @@ def merge_dicts(dict1: Dict[Any, List[Any]], return dict(merged_dict) -def init_cached_hf_modules(): +def init_cached_hf_modules() -> None: """ Lazy initialization of the Hugging Face modules. """ @@ -613,7 +621,7 @@ def find_library(lib_name: str) -> str: return locs[0] -def find_nccl_library(): +def find_nccl_library() -> str: """ We either use the library file specified by the `VLLM_NCCL_SO_PATH` environment variable, or we find the library file brought by PyTorch. 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 476e9ba3bb463..d0baa4337f84a 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -779,8 +779,8 @@ def profile_run(self) -> None: # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests = [] - dummy_lora_requests_per_seq = [] + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] if self.lora_config: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 258f31de17d87..3d52fd71ec4b8 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -99,8 +99,8 @@ class WorkerWrapperBase: """ def __init__(self, - worker_module_name=None, - worker_class_name=None, + worker_module_name: str, + worker_class_name: str, trust_remote_code: bool = False) -> None: self.worker_module_name = worker_module_name self.worker_class_name = worker_class_name From 81fbb3655f37e2b3ccbe0e17276c5d813b886417 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Jun 2024 19:29:42 +0800 Subject: [PATCH 06/98] [CI/Build] Test both text and token IDs in batched OpenAI Completions API (#5568) --- tests/entrypoints/test_openai_server.py | 88 +++++++++++++------------ 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index d66b9b0fd3882..c22a675ff1230 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -655,50 +655,52 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, [MODEL_NAME, "zephyr-lora"], ) async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary - # for official client. 
- use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" + # test both text and token IDs + for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): + # test simple list + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + ) + assert len(batch.choices) == 2 + assert batch.choices[0].text == batch.choices[1].text - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] + # test n = 2 + batch = await client.completions.create( + model=model_name, + prompt=prompts, + n=2, + max_tokens=5, + temperature=0.0, + extra_body=dict( + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. + use_beam_search=True), + ) + assert len(batch.choices) == 4 + assert batch.choices[0].text != batch.choices[ + 1].text, "beam search should be different" + assert batch.choices[0].text == batch.choices[ + 2].text, "two copies of the same prompt should be the same" + assert batch.choices[1].text == batch.choices[ + 3].text, "two copies of the same prompt should be the same" + + # test streaming + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + async for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + assert texts[0] == texts[1] @pytest.mark.asyncio From e691918e3bd75a05bc473c77577c494aa6442640 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Sat, 15 Jun 2024 23:59:36 +0900 Subject: [PATCH 07/98] [misc] Do not allow to use lora with chunked prefill. 
(#5538) Co-authored-by: Cyrus Leung --- vllm/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index d9e4a619ee010..54f36e1d66783 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1092,6 +1092,8 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " "LoRA is enabled.") + if scheduler_config.chunked_prefill_enabled: + raise ValueError("LoRA is not supported with chunked prefill yet.") @dataclass From d919ecc771ece6995a949c3d4284c534a2bd0890 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Sat, 15 Jun 2024 13:38:16 -0400 Subject: [PATCH 08/98] add gptq_marlin test for bug report https://github.com/vllm-project/vllm/issues/5088 (#5145) --- tests/models/test_gptq_marlin.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index e30100d9bf573..4abbc41c9c287 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -40,6 +40,9 @@ ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), # 8-bit, act_order==True, group_size=32 ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), + + # 4-bit, act_order==True, group_size=128 + ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") ] From 1c0afa13c57766641e75172ff1cac2e09f79a3b9 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 15 Jun 2024 16:30:51 -0700 Subject: [PATCH 09/98] [BugFix] Don't start a Ray cluster when not using Ray (#5570) --- vllm/config.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 54f36e1d66783..c0d294ce942e1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -616,9 +616,14 @@ def __init__( "required for multi-node inference") backend = "ray" elif ray_found: - from ray.util import get_current_placement_group - if self.placement_group or get_current_placement_group(): + if self.placement_group: backend = "ray" + else: + from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): + from ray.util import get_current_placement_group + if get_current_placement_group(): + backend = "ray" self.distributed_executor_backend = backend logger.info("Defaulting to use %s for distributed inference", backend) From 3ce2c050dd919542ef5355635edf71349ea597f2 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Sat, 15 Jun 2024 16:57:54 -0700 Subject: [PATCH 10/98] [Fix] Correct OpenAI batch response format (#5554) --- vllm/entrypoints/openai/protocol.py | 13 ++++++++++++- vllm/entrypoints/openai/run_batch.py | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3b56ad63f375d..b57d79859aec5 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -672,6 +672,17 @@ class BatchRequestInput(OpenAIBaseModel): body: Union[ChatCompletionRequest, ] +class BatchResponseData(OpenAIBaseModel): + # HTTP status code of the response. + status_code: int = 200 + + # An unique identifier for the API request. + request_id: str + + # The body of the response. + body: Union[ChatCompletionResponse, ] + + class BatchRequestOutput(OpenAIBaseModel): """ The per-line object of the batch output and error files @@ -683,7 +694,7 @@ class BatchRequestOutput(OpenAIBaseModel): # inputs. 
custom_id: str - response: Optional[ChatCompletionResponse] + response: Optional[BatchResponseData] # For requests that failed with a non-HTTP error, this will contain more # information on the cause of the failure. diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 91e567924b59e..b0c0f4ad2f47e 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -10,7 +10,9 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, - ChatCompletionResponse) + BatchResponseData, + ChatCompletionResponse, + ErrorResponse) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext @@ -77,20 +79,27 @@ async def run_request(chat_serving: OpenAIServingChat, request: BatchRequestInput) -> BatchRequestOutput: chat_request = request.body chat_response = await chat_serving.create_chat_completion(chat_request) + if isinstance(chat_response, ChatCompletionResponse): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, - response=chat_response, + response=BatchResponseData( + body=chat_response, request_id=f"vllm-batch-{random_uuid()}"), error=None, ) - else: + elif isinstance(chat_response, ErrorResponse): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, - response=None, + response=BatchResponseData( + status_code=chat_response.code, + request_id=f"vllm-batch-{random_uuid()}"), error=chat_response, ) + else: + raise ValueError("Request must not be sent in stream mode") + return batch_output From f31c1f90e381967d25591a8928782d8a6a13693e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Sun, 16 Jun 2024 00:48:02 -0700 Subject: [PATCH 11/98] Add basic correctness 2 GPU tests to 4 GPU pipeline (#5518) --- .buildkite/test-pipeline.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6a2932db9f2dc..6439a315e327a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -32,7 +32,7 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 2 -- label: Distributed Tests +- label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -50,12 +50,16 @@ steps: - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py -- label: Distributed Tests (Multiple Groups) +- label: Distributed Tests (4 GPUs) #mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 4 commands: - pytest -v -s distributed/test_pynccl.py + # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. + # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. 
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - label: Engine Test mirror_hardwares: [amd] From 4a6769053ab2616f7f490e6ec5b8241e76ef0c2a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sun, 16 Jun 2024 10:07:34 -0400 Subject: [PATCH 12/98] [CI][BugFix] Flip is_quant_method_supported condition (#5577) --- tests/quantization/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 0c92d565d0ddd..29085916afb4d 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -10,5 +10,5 @@ def is_quant_method_supported(quant_method: str) -> bool: capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - return (capability < + return (capability >= QUANTIZATION_METHODS[quant_method].get_min_capability()) From f07d5133202c08899eb5f51134af0f43b7791a33 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 16 Jun 2024 16:07:01 -0700 Subject: [PATCH 13/98] [build][misc] limit numpy version (#5582) --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index bf9987e3af014..32e2ebe8c6159 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -2,7 +2,7 @@ cmake >= 3.21 ninja # For faster builds. psutil sentencepiece # Required for LLaMA tokenizer. -numpy +numpy < 2.0.0 requests py-cpuinfo transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. From 845a3f26f9706acafe8fa45ae452846d8cc3b97f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 16 Jun 2024 19:08:01 -0700 Subject: [PATCH 14/98] [Doc] add debugging tips for crash and multi-node debugging (#5581) --- docs/source/getting_started/debugging.rst | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index ff37f4e628692..a22bba1478abb 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -24,6 +24,8 @@ If you have already taken care of the above issues, but the vLLM instance still With more logging, hopefully you can find the root cause of the issue. +If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. + Here are some common issues that can cause hangs: - **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. @@ -31,15 +33,26 @@ Here are some common issues that can cause hangs: .. 
code-block:: python - # save it as `test.py` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py` - # adjust `--nproc-per-node` to the number of GPUs you want to use. import torch import torch.distributed as dist dist.init_process_group(backend="nccl") - data = torch.FloatTensor([1,] * 128).to(f"cuda:{dist.get_rank()}") + local_rank = dist.get_rank() % torch.cuda.device_count() + data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}") dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.cuda.synchronize() value = data.mean().item() assert value == dist.get_world_size() +.. tip:: + + Save the script as ``test.py``. + + If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use. + + If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``: + + - is the correct IP address of the master node + - is reachable from all nodes + - is set before running the script. + If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. From e2b85cf86a522e734a38b1d0314cfe9625003ef9 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sun, 16 Jun 2024 23:48:06 -0700 Subject: [PATCH 15/98] Fix w8a8 benchmark and add Llama-3-8B (#5562) --- .../cutlass_benchmarks/w8a8_benchmarks.py | 21 ++++++++++++------- .../cutlass_benchmarks/weight_shapes.py | 6 ++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 182105f0b33f2..523e970c2c9be 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -46,7 +46,7 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int, # impl -def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, +def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, scale_b: torch.tensor, out_dtype: torch.dtype) -> torch.tensor: return torch.mm(a, b) @@ -115,7 +115,7 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers.append( bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, - torch.bfloat16, label, sub_label, pytorch_i8_impl, + torch.bfloat16, label, sub_label, pytorch_mm_impl, "pytorch_bf16_bf16_bf16_matmul-no-scales")) # cutlass impl @@ -136,6 +136,13 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers = [] + # pytorch impl w. 
bf16 + timers.append( + bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, + torch.bfloat16, label, sub_label, pytorch_mm_impl, + "pytorch_bf16_bf16_bf16_matmul-no-scales")) + # pytorch impl: bf16 output, without fp8 fast accum timers.append( bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, @@ -160,14 +167,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, # cutlass impl: bf16 output timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.bfloat16, label, sub_label, cutlass_impl, - "cutlass_fp8_fp8_bf16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm")) # cutlass impl: fp16 output timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.float16, label, sub_label, cutlass_impl, - "cutlass_fp8_fp8_fp16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label, + cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm")) return timers diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index 7ad4a53d376b6..25ec9d6028627 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -22,6 +22,12 @@ ([4096, 22016], 1), ([11008, 4096], 0), ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], "meta-llama/Llama-2-13b-hf": [ ([5120, 15360], 1), ([5120, 5120], 0), From 9333fb8eb9ed6a62d33ef4d56d589f83a0f19233 Mon Sep 17 00:00:00 2001 From: Amit Garg Date: Mon, 17 Jun 2024 09:04:14 -0700 Subject: [PATCH 16/98] [Model] Rename Phi3 rope scaling type (#5595) --- vllm/config.py | 5 ++++- .../model_executor/layers/rotary_embedding.py | 19 ++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index c0d294ce942e1..552d5033fdb9d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1287,7 +1287,10 @@ def _get_and_verify_max_len( derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) - if rope_scaling is not None and rope_scaling["type"] != "su": + # The correct one should be "longrope", kept "su" here + # to be backward compatible + if rope_scaling is not None and rope_scaling["type"] != "su" \ + and rope_scaling["type"] != "longrope": if disable_sliding_window: # TODO(robertgshaw): Find a model that supports rope_scaling # with sliding window to see if this case should be allowed. diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 792c4729355a7..5a4940acbbef2 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -467,7 +467,7 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: return cache -class Phi3SuScaledRotaryEmbedding(nn.Module): +class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): """Phi3 family of models scaled rotary embedding. Based on the original RotaryEmbedding implementation. 
@@ -491,11 +491,12 @@ def __init__( if rotary_dim != head_size: raise ValueError( - f"`Phi3SuScaledRotaryEmbedding` does not support rotary_dim != \ - head_size ({rotary_dim}!={head_size}).") + f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \ + rotary_dim != head_size ({rotary_dim}!={head_size}).") if is_neox_style is False: raise ValueError( - "`Phi3SuScaledRotaryEmbedding` only supports neox_style.") + "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." + ) self.head_size = head_size self.max_position_embeddings = max_position_embeddings @@ -608,7 +609,9 @@ def get_rope( is_neox_style, dtype) else: scaling_type = rope_scaling["type"] - if scaling_type != "su": + # The correct one should be "longrope" but keep "su" here + # for backward compatible + if scaling_type != "su" and scaling_type != "longrope": scaling_factor = rope_scaling["factor"] if scaling_type == "linear": rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, @@ -633,7 +636,9 @@ def get_rope( base, is_neox_style, scaling_factor, dtype, **extra_kwargs) - elif scaling_type == "su": + # The correct one should be "longrope" but keep "su" here + # for backward compatible + elif scaling_type == "su" or scaling_type == "longrope": short_factor = rope_scaling["short_factor"] long_factor = rope_scaling["long_factor"] original_max_position = rope_scaling[ @@ -643,7 +648,7 @@ def get_rope( for k, v in rope_scaling.items() if k in ("short_mscale", "long_mscale") } - rotary_emb = Phi3SuScaledRotaryEmbedding( + rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( head_size, rotary_dim, max_position, original_max_position, base, is_neox_style, dtype, short_factor, long_factor, **extra_kwargs) From 9e74d9d003d546c17dca472c3f4b48be651f1d7c Mon Sep 17 00:00:00 2001 From: Charles Riggins Date: Tue, 18 Jun 2024 00:05:33 +0800 Subject: [PATCH 17/98] Correct alignment in the seq_len diagram. (#5592) Co-authored-by: Liqian Chen --- vllm/attention/backends/flash_attn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 300bab72877b8..1c48e2a0bb33d 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -83,7 +83,7 @@ class FlashAttentionMetadata(AttentionMetadata): # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| + # |-------------------- seq_len ---------------------| # |-- query_len ---| # Maximum query length in the batch. None for decoding. 
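
The Phi3 rope-scaling change above (#5595) keeps checkpoint compatibility: configs that still declare the legacy "su" type and configs using the newer "longrope" name both resolve to Phi3LongRoPEScaledRotaryEmbedding, and neither path reads a scalar "factor". A small standalone sketch of that dispatch (simplified, not vLLM's actual get_rope; config values are illustrative):

from typing import Any, Dict


def is_long_rope(rope_scaling: Dict[str, Any]) -> bool:
    # Mirrors the `scaling_type == "su" or scaling_type == "longrope"` checks
    # added in the patch: both names select the long-RoPE handling.
    return rope_scaling["type"] in ("su", "longrope")


legacy_cfg = {
    "type": "su",                    # older Phi-3 checkpoints
    "short_factor": [1.0] * 48,      # per-dimension factors; values illustrative
    "long_factor": [2.0] * 48,
    "original_max_position_embeddings": 4096,
}
new_cfg = {**legacy_cfg, "type": "longrope"}  # newer checkpoints

assert is_long_rope(legacy_cfg) and is_long_rope(new_cfg)
# Unlike "linear" or "dynamic" scaling, neither config carries a single
# "factor" key, which is why config.py special-cases these types when
# deriving the maximum model length.
assert "factor" not in legacy_cfg and "factor" not in new_cfg
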
From 890d8d960bb441b4ac46588492db7f16b6da78d7 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 17 Jun 2024 12:32:48 -0400 Subject: [PATCH 18/98] [Kernel] `compressed-tensors` marlin 24 support (#5435) --- tests/quantization/test_compressed_tensors.py | 23 ++- .../compressed_tensors/compressed_tensors.py | 48 ++++--- .../compressed_tensors/schemes/__init__.py | 2 + .../schemes/compressed_tensors_w4a16_24.py | 134 ++++++++++++++++++ .../quantization/compressed_tensors/utils.py | 8 ++ 5 files changed, 196 insertions(+), 19 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 5670498f2d1e9..611c6b8b7fb9a 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -9,7 +9,8 @@ from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW4A16, - CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) + CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, + CompressedTensorsW8A8StaticTensor) def test_compressed_tensors_w8a8_static_setup(vllm_runner): @@ -51,8 +52,7 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2" - with vllm_runner(model_path, enforce_eager=True, - dtype=torch.float16) as llm: + with vllm_runner(model_path, dtype=torch.float16) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -83,3 +83,20 @@ def test_compressed_tensors_w4a16(vllm_runner, w4a16_args): assert qkv_proj.weight_packed.dtype is torch.int32 assert qkv_proj.weight_scale.dtype is torch.float16 assert qkv_proj.weight_packed.pack_factor == 8 + + +def test_compressed_tensors_w4a16_marlin24(vllm_runner): + model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" + with vllm_runner(model_path) as llm: + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 + + sampling_params = SamplingParams() + output = llm.generate("Hello world!", sampling_params=sampling_params) + assert output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index e134a26efa3d9..92a84b3c0dd89 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -8,16 +8,20 @@ QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme, CompressedTensorsW4A16, - CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) + CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, + CompressedTensorsW8A8StaticTensor) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - 
QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) + CompressionFormat, QuantizationArgs, QuantizationStrategy, + find_first_name_or_class_match) class CompressedTensorsConfig(QuantizationConfig): - def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str]): + def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str], + quant_format: str): self.ignore = ignore self.layer_quant_details = layer_quant_details + self.quant_format = quant_format def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) @@ -46,6 +50,7 @@ def get_quant_method( def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": layer_quant_details: Dict[str, Any] = dict() ignore: List[str] = config.get("ignore", None) + quant_format: str = config.get("format", None) # The quant_config has multiple config_groups, each containing # an input_activations key with details about how the activations are @@ -69,7 +74,9 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": except Exception: layer_quant_details[target]["input_activations"] = None - return cls(layer_quant_details=layer_quant_details, ignore=ignore) + return cls(layer_quant_details=layer_quant_details, + ignore=ignore, + quant_format=quant_format) @classmethod def get_config_filenames(cls) -> List[str]: @@ -110,17 +117,26 @@ def _get_schema(self, weight_quant: BaseModel, input_quant: BaseModel) -> "CompressedTensorsScheme": if self._is_w4a16(weight_quant, input_quant): - return CompressedTensorsW4A16(num_bits=weight_quant.num_bits, - strategy=weight_quant.strategy, - group_size=weight_quant.group_size) - - if self._is_static_tensor_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8StaticTensor() - - if self._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8DynamicToken() - - raise NotImplementedError("Scheme not supported.") + if self.quant_format == CompressionFormat.marlin_24.value: + return CompressedTensorsW4A16Sparse24( + strategy=weight_quant.strategy, + num_bits=weight_quant.num_bits, + group_size=weight_quant.group_size) + if self.quant_format == CompressionFormat.pack_quantized.value: + return CompressedTensorsW4A16( + num_bits=weight_quant.num_bits, + strategy=weight_quant.strategy, + group_size=weight_quant.group_size) + + if self.quant_format == CompressionFormat.int_quantized.value: + if self._is_static_tensor_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8StaticTensor() + + if self._is_dynamic_token_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8DynamicToken() + + raise NotImplementedError( + "No compressed-tensors compatible scheme was found.") def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme": @@ -165,9 +181,9 @@ def create_weights(self, layer: torch.nn.Module, scheme = self.quantization_config.get_scheme(layer=layer) scheme.create_weights( layer=layer, + input_size=input_size, input_size_per_partition=input_size_per_partition, output_partition_sizes=output_partition_sizes, - input_size=input_size, output_size=output_size, params_dtype=params_dtype, weight_loader=weight_loader) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index dc84d000803f9..3c95aa11fc76c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -2,6 +2,8 @@ from .compressed_tensors_unquantized import ( # noqa: F401 CompressedTensorsUnquantized) from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401 +from .compressed_tensors_w4a16_24 import ( # noqa: F401 + CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 CompressedTensorsW8A8DynamicToken) from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py new file mode 100644 index 0000000000000..d7e04ddb8d94a --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -0,0 +1,134 @@ +from typing import Callable, List, Optional + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW4A16Sparse24"] + + +class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme): + + def __init__(self, + strategy: str, + num_bits: int, + group_size: Optional[int] = None): + self.strategy = strategy + self.group_size = group_size + self.num_bits = num_bits + self.tile_size = 16 + + if self.strategy == "group" and self.group_size is None: + raise ValueError( + "group_size must be given when using strategy group") + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + pack_factor = 32 // self.num_bits + output_size_per_partition = sum(output_partition_sizes) + + qweight = Parameter( + torch.empty( + input_size_per_partition // self.tile_size // 2, + output_size_per_partition * self.tile_size // pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, + { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": pack_factor, + "marlin_tile_size": self.tile_size, + "weight_loader": weight_loader + }, + ) + + layer.register_parameter("weight_packed", qweight) + + input_groups = (1 if self.group_size is None else + input_size_per_partition // self.group_size) + + scales = Parameter( + torch.empty( + input_groups, + output_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs( + scales, + { + "output_dim": 1, + "input_dim": None if input_groups == 1 else 0, + "weight_loader": weight_loader + }, + ) + layer.register_parameter("scale_packed", scales) + + weight_shape = Parameter(torch.empty(2, dtype=torch.int64), + requires_grad=False) + + layer.register_parameter("weight_shape", weight_shape) + set_weight_attrs(weight_shape, {"weight_loader": weight_loader}) + + meta = Parameter( + torch.empty( + input_size_per_partition // 8 // 2 // 2, + output_size_per_partition * 2, + dtype=torch.int16, + ), + requires_grad=False, + ) + set_weight_attrs( + meta, + { + "input_dim": 0, + "packed_dim": 1, + "pack_factor": 1, + "output_dim": 1, + "marlin_tile_size": 2, + "weight_loader": weight_loader 
+ }, + ) + layer.register_parameter("meta", meta) + + max_workspace_size = ( + output_size_per_partition // + GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL + workspace = Parameter(torch.zeros(max_workspace_size, dtype=torch.int), + requires_grad=False) + layer.workspace = workspace + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + qweight = layer.weight_packed + meta = layer.meta + scales = layer.scale_packed + workspace = layer.workspace + + x_2d = x.view(-1, x.shape[-1]) + + size_m = x_2d.shape[0] + size_k = x_2d.shape[1] + size_n = scales.shape[1] + + output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales, + workspace, self.num_bits, size_m, + size_n, size_k) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) + return output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index fcc6649101845..b2bec9b603d1a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -6,6 +6,14 @@ from torch.nn import Module +class CompressionFormat(Enum): + dense = "dense" + sparse_bitmask = "sparse-bitmask" + int_quantized = "int-quantized" + pack_quantized = "pack-quantized" + marlin_24 = "marlin-24" + + class QuantizationType(str, Enum): """ Enum storing quantization type options From 1f12122b1714c855c02699775bcd2fb2b34f2577 Mon Sep 17 00:00:00 2001 From: zhyncs Date: Tue, 18 Jun 2024 00:40:35 +0800 Subject: [PATCH 19/98] [Misc] use AutoTokenizer for benchmark serving when vLLM not installed (#5588) --- benchmarks/backend_request_func.py | 29 ++++++++++++++++++++++++++++- benchmarks/benchmark_serving.py | 5 ++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 52386b8cd62b3..4350b96b04a6a 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -4,10 +4,13 @@ import time import traceback from dataclasses import dataclass, field -from typing import List, Optional +from typing import List, Optional, Union import aiohttp +import huggingface_hub.constants from tqdm.asyncio import tqdm +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -388,6 +391,30 @@ def remove_prefix(text: str, prefix: str) -> str: return text +def get_model(pretrained_model_name_or_path: str): + if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': + from modelscope import snapshot_download + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + return model_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, trust_remote_code: bool +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path): + pretrained_model_name_or_path = get_model( + pretrained_model_name_or_path) + return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + + ASYNC_REQUEST_FUNCS = { "tgi": async_request_tgi, "vllm": async_request_openai_completions, diff --git a/benchmarks/benchmark_serving.py 
b/benchmarks/benchmark_serving.py index c136ee572fdf3..eef03e7d81c39 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -39,7 +39,10 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm.transformers_utils.tokenizer import get_tokenizer +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer @dataclass From 728c4c8a063c25e7a20d6eda20a3f30873bda4c6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 18 Jun 2024 02:01:25 +0800 Subject: [PATCH 20/98] [Hardware][Intel GPU] Add Intel GPU(XPU) inference backend (#3814) Co-authored-by: Jiang Li Co-authored-by: Abhilash Majumder Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> --- .buildkite/run-xpu-test.sh | 14 + .buildkite/test-template.j2 | 5 + Dockerfile.xpu | 22 + benchmarks/benchmark_latency.py | 2 +- benchmarks/benchmark_throughput.py | 2 +- .../getting_started/xpu-installation.rst | 61 +++ docs/source/index.rst | 1 + requirements-xpu.txt | 11 + setup.py | 8 + vllm/_custom_ops.py | 3 +- vllm/_ipex_ops.py | 241 ++++++++++ vllm/attention/backends/ipex_attn.py | 355 +++++++++++++++ vllm/attention/selector.py | 15 +- vllm/config.py | 4 +- vllm/distributed/parallel_state.py | 2 +- vllm/engine/arg_utils.py | 11 +- vllm/engine/async_llm_engine.py | 11 + vllm/engine/llm_engine.py | 8 + vllm/executor/ray_utils.py | 4 +- vllm/executor/ray_xpu_executor.py | 401 +++++++++++++++++ vllm/executor/xpu_executor.py | 98 ++++ vllm/model_executor/custom_op.py | 8 +- vllm/model_executor/layers/activation.py | 35 ++ vllm/model_executor/layers/layernorm.py | 24 + .../model_executor/layers/rotary_embedding.py | 23 + .../layers/vocab_parallel_embedding.py | 2 +- vllm/utils.py | 31 +- vllm/worker/cache_engine.py | 7 +- vllm/worker/worker.py | 3 +- vllm/worker/xpu_model_runner.py | 417 ++++++++++++++++++ vllm/worker/xpu_worker.py | 193 ++++++++ 31 files changed, 1998 insertions(+), 24 deletions(-) create mode 100644 .buildkite/run-xpu-test.sh create mode 100644 Dockerfile.xpu create mode 100644 docs/source/getting_started/xpu-installation.rst create mode 100644 requirements-xpu.txt create mode 100644 vllm/_ipex_ops.py create mode 100644 vllm/attention/backends/ipex_attn.py create mode 100644 vllm/executor/ray_xpu_executor.py create mode 100644 vllm/executor/xpu_executor.py create mode 100644 vllm/worker/xpu_model_runner.py create mode 100644 vllm/worker/xpu_worker.py diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh new file mode 100644 index 0000000000000..22a7e76937a76 --- /dev/null +++ b/.buildkite/run-xpu-test.sh @@ -0,0 +1,14 @@ +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t xpu-test -f Dockerfile.xpu . 
+ +# Setup cleanup +remove_docker_container() { docker rm -f xpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 4a20a462b98ec..3bd1e90c2b711 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -45,6 +45,11 @@ steps: queue: intel command: bash .buildkite/run-cpu-test.sh + - label: "XPU Test" + agents: + queue: intel + command: bash .buildkite/run-xpu-test.sh + {% for step in steps %} - label: "{{ step.label }}" agents: diff --git a/Dockerfile.xpu b/Dockerfile.xpu new file mode 100644 index 0000000000000..c39e551672d20 --- /dev/null +++ b/Dockerfile.xpu @@ -0,0 +1,22 @@ +FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ + chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ + rm /etc/apt/sources.list.d/intel-graphics.list && \ + wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ + echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ + chmod 644 /usr/share/keyrings/intel-graphics.gpg + +RUN apt-get update -y \ +&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-xpu.txt + +RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install + +CMD ["/bin/bash"] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 9937f8333fb7e..11d1bf7a4c58f 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -191,7 +191,7 @@ def run_to_completion(profile_dir: Optional[str] = None): "--device", type=str, default="cuda", - choices=["cuda", "cpu", "tpu"], + choices=["cuda", "cpu", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument('--block-size', type=int, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 48dfce4287671..ed65002bc7d3c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -349,7 +349,7 @@ def main(args: argparse.Namespace): "--device", type=str, default="cuda", - choices=["cuda", "cpu", "tpu"], + choices=["cuda", "cpu", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument( "--enable-prefix-caching", diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst new file mode 100644 index 0000000000000..4f0d2da25b8e8 --- /dev/null +++ b/docs/source/getting_started/xpu-installation.rst @@ -0,0 +1,61 @@ +.. _installation_xpu: + +Installation with XPU +======================== + +vLLM initially supports basic model inferencing and serving on Intel GPU platform. + +Table of contents: + +#. 
:ref:`Requirements ` +#. :ref:`Quick start using Dockerfile ` +#. :ref:`Build from source ` + +.. _xpu_backend_requirements: + +Requirements +------------ + +* OS: Linux +* Supported Hardware: Intel Data Center GPU (Intel ARC GPU WIP) +* OneAPI requirements: oneAPI 2024.1 + +.. _xpu_backend_quick_start_dockerfile: + +Quick start using Dockerfile +---------------------------- + +.. code-block:: console + + $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + vllm-xpu-env + +.. _build_xpu_backend_from_source: + +Build from source +----------------- + +- First, install required driver and intel OneAPI 2024.1. + +- Second, install Python packages for vLLM XPU backend building: + +.. code-block:: console + + $ pip install --upgrade pip + $ pip install -v -r requirements-xpu.txt + +- Finally, build and install vLLM XPU backend: + +.. code-block:: console + + $ VLLM_TARGET_DEVICE=xpu python setup.py install + +.. note:: + - FP16 is the default data type in the current XPU backend. The BF16 data + type will be supported in the future. + diff --git a/docs/source/index.rst b/docs/source/index.rst index f5d8627596a70..8795a865c3db6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,6 +66,7 @@ Documentation getting_started/cpu-installation getting_started/neuron-installation getting_started/tpu-installation + getting_started/xpu-installation getting_started/quickstart getting_started/debugging getting_started/examples/examples_index diff --git a/requirements-xpu.txt b/requirements-xpu.txt new file mode 100644 index 0000000000000..48d899ec70eda --- /dev/null +++ b/requirements-xpu.txt @@ -0,0 +1,11 @@ +# Common dependencies +-r requirements-common.txt + +setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed. 
+ +torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl +intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl + +triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + diff --git a/setup.py b/setup.py index 12a704e08eedb..b2ae6def8cdc6 100644 --- a/setup.py +++ b/setup.py @@ -233,6 +233,10 @@ def _is_cpu() -> bool: return VLLM_TARGET_DEVICE == "cpu" +def _is_xpu() -> bool: + return VLLM_TARGET_DEVICE == "xpu" + + def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() @@ -337,6 +341,8 @@ def get_vllm_version() -> str: version += "+tpu" elif _is_cpu(): version += "+cpu" + elif _is_xpu(): + version += "+xpu" else: raise RuntimeError("Unknown runtime environment") @@ -386,6 +392,8 @@ def _read_requirements(filename: str) -> List[str]: requirements = _read_requirements("requirements-tpu.txt") elif _is_cpu(): requirements = _read_requirements("requirements-cpu.txt") + elif _is_xpu(): + requirements = _read_requirements("requirements-xpu.txt") else: raise ValueError( "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 2f84b8bde6b57..ab2a67950bfea 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -373,7 +373,8 @@ def reshape_and_cache_flash( kv_cache_dtype) -def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor, +def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py new file mode 100644 index 0000000000000..1e60e0848673b --- /dev/null +++ b/vllm/_ipex_ops.py @@ -0,0 +1,241 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +try: + import intel_extension_for_pytorch as ipex +except ImportError as e: + logger.warning("Import error msg: %s", e.msg) + + +class ipex_ops: + + @staticmethod + def _reshape_activation_tensor( + x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + num = x.size(0) + d = x.size(1) // 2 + x = x.reshape(num, 2, d) + x1, x2 = torch.chunk(x, chunks=2, dim=1) + x1 = x1.reshape(num, d) + x2 = x2.reshape(num, d) + return x1, x2 + + def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.silu_mul(x1, x2, out) + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.gelu_mul(x1, x2, out, "none") + + def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops._reshape_activation_tensor(x) + ipex.llm.functional.gelu_mul(x1, x2, out, "tanh") + + def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + out.copy_(torch.nn.functional.gelu(x)) + + def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + out.copy_(torch.nn.functional.gelu(x)) + + def paged_attention_v1( + out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + 
num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + head_mapping = torch.arange( + 0, + num_kv_heads, + device=query.device, + dtype=torch.int32, + ).view(num_kv_heads, + 1).repeat_interleave(num_queries_per_tokens).flatten() + # todo: ipex will refactor namespace + torch.xpu.paged_attention_v1(out, query.contiguous(), + key_cache.view_as(value_cache), + value_cache, head_mapping, scale, + block_tables, context_lens, block_size, + max_context_len, alibi_slopes) + + def paged_attention_v2( + out: torch.Tensor, + exp_sum: torch.Tensor, + max_logits: torch.Tensor, + tmp_out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + head_mapping = torch.arange( + 0, + num_kv_heads, + dtype=torch.int32, + device=query.device, + ).view(num_kv_heads, + 1).repeat_interleave(num_queries_per_tokens).flatten() + # todo: ipex will refactor namespace + torch.xpu.paged_attention_v2(out, exp_sum, max_logits, tmp_out, + query.contiguous(), + key_cache.view_as(value_cache), + value_cache, head_mapping, block_tables, + context_lens, scale, block_size, + max_context_len, alibi_slopes) + + def rotary_embedding( + positions: torch.Tensor, # [batch_size, seq_len] + query: torch.Tensor, # [batch_size, seq_len, num_heads*head_size] + key: torch.Tensor, # [batch_size, seq_len, num_kv_heads*head_size] + head_size: int, + cos_sin_cache: torch.Tensor, # [cos_sin_dim, rot_dim] + is_neox: bool, + ) -> None: + if positions.dim() == 1: + positions = positions.unsqueeze(0) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + + rotary_dim = cos_sin_cache.size(1) + query = query.view(*query.shape[:-1], -1, head_size) + key = key.view(*key.shape[:-1], -1, head_size) + + query_rot = query[..., :rotary_dim] + key_rot = key[..., :rotary_dim] + + cos_sin = cos_sin_cache[positions.long()] + cos, sin = cos_sin.chunk(2, dim=-1) + + if is_neox: + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos, + rotary_dim, is_neox, positions) + + def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, + key: torch.Tensor, head_size: int, + cos_sin_cache: torch.Tensor, is_neox: bool, + rot_dim: int, + cos_sin_cache_offsets: torch.Tensor) -> None: + if positions.dim() == 1: + positions = positions.unsqueeze(0) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + cos_sin_cache_offsets = 
cos_sin_cache_offsets.view_as(positions) + rotary_dim = cos_sin_cache.size(1) + query = query.view(*query.shape[:-1], -1, head_size) + key = key.view(*key.shape[:-1], -1, head_size) + + query_rot = query[..., :rotary_dim] + key_rot = key[..., :rotary_dim] + + cos_sin = cos_sin_cache[torch.add(positions, + cos_sin_cache_offsets).long()] + cos, sin = cos_sin.chunk(2, dim=-1) + + if is_neox: + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + + ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos, + rotary_dim, is_neox, positions) + + def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, + epsilon: float) -> None: + tmp = ipex.llm.functional.rms_norm(input, weight, epsilon) + out.copy_(tmp) + + def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, + weight: torch.Tensor, epsilon: float) -> None: + tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None, + epsilon, True) + input.copy_(tmp) + + def varlen_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + seqlen_q: torch.Tensor, + seqlen_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + pdropout: float, + softmax_scale: float, + zero_tensors: bool, + is_causal: bool, + return_softmax: bool, + gen_: torch.Generator, + ) -> None: + ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q, + seqlen_k, max_seqlen_q, + max_seqlen_k, pdropout, + softmax_scale, zero_tensors, + is_causal, return_softmax, gen_) + + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + kv_scale: float, + ) -> None: + assert kv_cache_dtype == "auto" + ipex.llm.modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, slot_mapping) + + @staticmethod + def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.xpu.copy_blocks(key_caches, value_caches, block_mapping) + + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, + block_mapping: torch.Tensor) -> None: + torch.xpu.swap_blocks(src, dst, block_mapping) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py new file mode 100644 index 0000000000000..f09b24f2a0304 --- /dev/null +++ b/vllm/attention/backends/ipex_attn.py @@ -0,0 +1,355 @@ +""" Attention layer with torch scaled_dot_product_attention + and PagedAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch + +from vllm._ipex_ops import ipex_ops +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata) +from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) + +_PARTITION_SIZE = 512 + + +class IpexAttnBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + return "ipex-attn" + + @staticmethod + def get_impl_cls() -> Type["IpexAttnBackendImpl"]: + return IpexAttnBackendImpl + + @staticmethod + def make_metadata(*args, **kwargs) -> "IpexAttnMetadata": + return IpexAttnMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return 
PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + PagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): + """Metadata for IpexAttnBackend. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + slot_mapping: torch.Tensor + seq_lens: Optional[List[int]] + seqlen_q: Optional[torch.Tensor] + max_seqlen: Optional[int] + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. + # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[List[torch.Tensor]] = None + + @property + def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: + # Currently chunked prefill is not supported + if self.num_decode_tokens == 0: + assert self.num_prefills > 0 + return self + + return None + + @property + def decode_metadata(self) -> Optional["IpexAttnMetadata"]: + # Currently chunked prefill is not supported + if self.num_prefills > 0: + assert self.num_decode_tokens == 0 + return None + + return self + + +class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + ) -> None: + assert blocksparse_params is None, ValueError( + "Torch SPDA does not support block-sparse attention.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.need_mask = (self.alibi_slopes is not None + or self.sliding_window is not None) + + supported_head_sizes = PagedAttention.get_supported_head_sizes() + if head_size not in supported_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {supported_head_sizes}.") + if kv_cache_dtype != "auto": + raise NotImplementedError( + "IPEX backend does not support FP8 KV cache. 
" + "Please use xFormers backend instead.") + + def split_kv_cache( + self, + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 1 + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: Optional[torch.Tensor], + attn_metadata: IpexAttnMetadata, # type: ignore + kv_scale: float = 1.0, + ) -> torch.Tensor: + """Forward pass with IPEX varlen_attention and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert kv_scale == 1.0 + num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + if kv_cache is not None: + key_cache, value_cache = self.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + ipex_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping.flatten(), + self.kv_cache_dtype, + kv_scale, + ) + + if attn_metadata.is_prompt: + assert attn_metadata.seq_lens is not None + if (kv_cache is None or attn_metadata.block_tables.numel() == 0): + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, + dim=1) + + if attn_metadata.attn_bias is None: + if self.alibi_slopes is not None: + att_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, None, dtype=query.dtype) + attn_metadata.attn_bias = att_masks + + output = torch.empty( + (num_tokens, self.num_heads, self.head_size), + dtype=query.dtype, + device=query.device) + ipex_ops.varlen_attention(query, + key, + value, + output, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, + is_causal=True, + return_softmax=False, + gen_=None) + else: + # prefix-enabled attention + raise RuntimeError( + "IPEX backend doesn't support prefix decoding.") + + else: + # Decoding run. + max_seq_len = attn_metadata.max_decode_seq_len + output = torch.empty_like(query) + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. 
+ # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory + # shortage. + use_v1 = (max_seq_len <= 8192 and + (max_num_partitions == 1 or num_seqs * num_heads > 512)) + if use_v1: + # Run PagedAttention V1. + ipex_ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + self.num_kv_heads, + self.scale, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, + block_size, + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, + kv_scale, + ) + else: + # Run PagedAttention V2. + assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ipex_ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + self.num_kv_heads, + self.scale, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, + block_size, + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, + kv_scale, + ) + + # Reshape the output tensor. + return output.view(-1, self.num_heads * self.head_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + dtype: torch.dtype, + seq_lens: List[int], +) -> List[torch.Tensor]: + attn_biases = [] + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + bias = bias[None, :] - bias[:, None] + + num_heads = alibi_slopes.shape[0] + bias = bias[None, :].repeat((num_heads, 1, 1)) + bias.mul_(alibi_slopes[:, None, None]) + inf_mask = torch.empty( + (1, seq_len, seq_len), + dtype=bias.dtype, + device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1) + attn_biases.append((bias + inf_mask).to(dtype)) + + return attn_biases + + +def _make_sliding_window_bias( + seq_lens: List[int], + window_size: Optional[int], + dtype: torch.dtype, +) -> List[torch.Tensor]: + attn_biases = [] + for seq_len in seq_lens: + tensor = torch.full( + (1, seq_len, seq_len), + dtype=dtype, + fill_value=1, + ) + shift = 0 + mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore + if window_size is not None: + mask = torch.triu(mask, diagonal=shift - window_size + 1) + mask = torch.log(mask) + attn_biases.append(mask.to(dtype)) + + return attn_biases diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 8b07fb2d768f5..1d56d87ccd119 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -7,7 +7,7 @@ import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_cpu, is_hip, is_tpu +from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu logger = init_logger(__name__) @@ -19,6 +19,7 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() FLASHINFER = enum.auto() PALLAS = enum.auto() + IPEX = enum.auto() @lru_cache(maxsize=None) @@ -58,12 +59,17 @@ def get_attn_backend( ROCmFlashAttentionBackend) return ROCmFlashAttentionBackend elif backend == _Backend.TORCH_SDPA: - # TODO: make XPU backend available here. 
assert is_cpu(), RuntimeError( "Torch SDPA backend is only used for the CPU device.") logger.info("Using Torch SDPA backend.") from vllm.attention.backends.torch_sdpa import TorchSDPABackend return TorchSDPABackend + elif backend == _Backend.IPEX: + assert is_xpu(), RuntimeError( + "IPEX attention backend is only used for the XPU device.") + logger.info("Using IPEX attention backend.") + from vllm.attention.backends.ipex_attn import IpexAttnBackend + return IpexAttnBackend elif backend == _Backend.FLASHINFER: logger.info("Using Flashinfer backend.") logger.warning("Eager mode is required for the Flashinfer backend. " @@ -107,6 +113,11 @@ def which_attn_to_use( logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA + if is_xpu(): + if selected_backend != _Backend.IPEX: + logger.info("Cannot use %s backend on XPU.", selected_backend) + return _Backend.IPEX + if is_tpu(): if selected_backend != _Backend.PALLAS: logger.info("Cannot use %s backend on TPU.", selected_backend) diff --git a/vllm/config.py b/vllm/config.py index 552d5033fdb9d..b1a3a82f5a6c0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -12,7 +12,7 @@ from vllm.model_executor.models import ModelRegistry from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_tpu) + is_hip, is_neuron, is_tpu, is_xpu) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -757,6 +757,8 @@ def __init__(self, device: str = "auto") -> None: self.device_type = "tpu" elif is_cpu(): self.device_type = "cpu" + elif is_xpu(): + self.device_type = "xpu" else: # We don't call torch.cuda.is_available() here to # avoid initializing CUDA before workers are forked diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 16c5297af1b53..02b0dcbcb6b24 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -58,7 +58,7 @@ def _split_tensor_dict( # because it contains not only the device type but also the device # index (e.g. "cuda:0"). We only need the device type. # receiving side will set the device index. - device = "cpu" if value.is_cpu else "cuda" + device = value.device.type metadata_list.append( (key, TensorMetadata(device, value.dtype, value.size()))) tensor_list.append(value) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ba53b5c86fa72..9d04f1dc557fd 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -501,11 +501,12 @@ def add_cli_args( 'Enabling this will use the fully sharded layers. 
' 'At high sequence length, max rank or ' 'tensor parallel size, this is likely faster.')) - parser.add_argument("--device", - type=str, - default=EngineArgs.device, - choices=["auto", "cuda", "neuron", "cpu", "tpu"], - help='Device type for vLLM execution.') + parser.add_argument( + "--device", + type=str, + default=EngineArgs.device, + choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu"], + help='Device type for vLLM execution.') # Related to Vision-language models such as llava parser = EngineArgs.add_cli_args_for_vlm(parser) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 03b6d03a9fdef..ab312850b9ec2 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -383,6 +383,17 @@ def from_engine_args( "Distributed execution is not supported with the CPU backend.") from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync + elif engine_config.device_config.device_type == "xpu": + if distributed_executor_backend is None: + from vllm.executor.xpu_executor import XPUExecutorAsync + executor_class = XPUExecutorAsync + elif distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync + executor_class = RayXPUExecutorAsync + else: + raise RuntimeError( + "Not supported distributed execution model on XPU device.") elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fd64337d4384c..eed9a17e477f3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -347,6 +347,14 @@ def from_engine_args( elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor + elif engine_config.device_config.device_type == "xpu": + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_xpu_executor import RayXPUExecutor + executor_class = RayXPUExecutor + else: + from vllm.executor.xpu_executor import XPUExecutor + executor_class = XPUExecutor elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutor diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4704f5f1b1a10..495fddd175dd4 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import get_ip, is_hip +from vllm.utils import get_ip, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -71,7 +71,7 @@ def initialize_ray_cluster( "serving.") # Connect to a ray cluster. 
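# Condensed restatement (illustration only) of the XPU executor routing added
# to AsyncLLMEngine.from_engine_args above; the synchronous LLMEngine picks
# RayXPUExecutor / XPUExecutor in the same way, minus the error branch.
def pick_async_xpu_executor(distributed_executor_backend):
    if distributed_executor_backend is None:
        return "XPUExecutorAsync"
    if distributed_executor_backend == "ray":
        return "RayXPUExecutorAsync"
    raise RuntimeError(
        "Not supported distributed execution model on XPU device.")

assert pick_async_xpu_executor(None) == "XPUExecutorAsync"
assert pick_async_xpu_executor("ray") == "RayXPUExecutorAsync"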
- if is_hip(): + if is_hip() or is_xpu(): ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py new file mode 100644 index 0000000000000..dd7c82289341e --- /dev/null +++ b/vllm/executor/ray_xpu_executor.py @@ -0,0 +1,401 @@ +import asyncio +import os +import pickle +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set, + Tuple, Union) + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.executor.distributed_gpu_executor import ( # yapf: disable + DistributedGPUExecutor, DistributedGPUExecutorAsync) +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayXPUExecutor(DistributedGPUExecutor): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + speculative_config: Optional[SpeculativeConfig], + ) -> None: + assert device_config.device_type == "xpu" + assert (not speculative_config + ), "Speculative decoding not yet supported for XPU backend" + + self.model_config = model_config + self.cache_config = cache_config + self.load_config = load_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.vision_language_config = vision_language_config + + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + def _init_executor(self) -> None: + pass + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks. 
+ + This invokes `determine_num_available_blocks` on each worker and takes + the min of the results, guaranteeing that the selected cache sizes are + compatible with all workers. + + Returns: + - Tuple[num_gpu_blocks, num_cpu_blocks] + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers("determine_num_available_blocks", ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + + return num_gpu_blocks, num_cpu_blocks + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Create the workers. + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote( + worker_module_name="vllm.worker.xpu_worker", + worker_class_name="XPUWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name="vllm.worker.xpu_worker", + worker_class_name="XPUWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + # Else, added to the list of workers. + self.workers.append(worker) + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + # TODO: add env var for xpu + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + def collect_arg_helper_func(**kwargs): + # avoid writing `{"name": value}` manually + return kwargs + + init_worker_all_kwargs = [] + + # Initialize the actual workers inside worker wrapper. 
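# Illustration only: the min-reduction in determine_num_available_blocks above
# guarantees the agreed-upon KV cache size fits on every worker
# (the per-worker numbers here are invented).
num_blocks_per_worker = [(1200, 4096), (1100, 4096), (1180, 4096)]
num_gpu_blocks = min(b[0] for b in num_blocks_per_worker)
num_cpu_blocks = min(b[1] for b in num_blocks_per_worker)
assert (num_gpu_blocks, num_cpu_blocks) == (1100, 4096)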
+ for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ): + local_rank = node_workers[node_id].index(rank) + init_worker_all_kwargs.append( + collect_arg_helper_func( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=rank == 0, + )) + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Initialize the KV cache in all workers. + """ + + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. + logger.info("# GPU blocks: %d, " + "# CPU blocks: %d", num_gpu_blocks, num_cpu_blocks) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> Set[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers. Can be used in the following + ways: + + - args/kwargs: All workers share the same args/kwargs + - args/kwargs and driver_args/driver_kwargs: Driver worker has + different args + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + assert self.forward_dag is not None + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. 
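# Illustration only: how per-call arguments are split in _run_workers.
# Index 0 of all_args/all_kwargs is reserved for the driver worker; the
# remaining entries are streamed to the ray workers via islice.
from itertools import islice, repeat

all_kwargs = [{"rank": 0}, {"rank": 1}, {"rank": 2}]
driver_kwargs = all_kwargs[0]
remote_kwargs = list(islice(all_kwargs, 1, None))
assert driver_kwargs == {"rank": 0}
assert remote_kwargs == [{"rank": 1}, {"rank": 2}]

# When no per-worker kwargs are supplied, every remote worker gets the same dict.
num_remote_workers = 2
assert list(repeat({"seed": 0}, num_remote_workers)) == [{"seed": 0}, {"seed": 0}]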
+ ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, + **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. + for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import InputNode, MultiOutputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote. + bind( # type: ignore[attr-defined] + input_data) for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. 
") + + +class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_method = make_async(self.driver_worker.execute_method) + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py new file mode 100644 index 0000000000000..d37200bd02de3 --- /dev/null +++ b/vllm/executor/xpu_executor.py @@ -0,0 +1,98 @@ +from typing import List, Optional + +import torch + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.gpu_executor import GPUExecutor +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import make_async +from vllm.worker.worker_base import WorkerWrapperBase + +logger = init_logger(__name__) + + +class XPUExecutor(GPUExecutor): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + speculative_config: Optional[SpeculativeConfig], + ) -> None: + assert device_config.device_type == "xpu" + assert (not speculative_config + ), "Speculative decoding not yet supported for XPU backend" + + model_config = _verify_and_get_model_config(model_config) + + self.model_config = model_config + self.cache_config = cache_config + self.load_config = load_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.vision_language_config = vision_language_config + self.speculative_config = None + + # Instantiate the worker and load the model to GPU. 
+ self._init_executor() + + def _create_worker(self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None): + if self.speculative_config is None: + worker_module_name = "vllm.worker.xpu_worker" + worker_class_name = "XPUWorker" + else: + raise NotImplementedError( + "XPU does not support speculative decoding") + + wrapper = WorkerWrapperBase( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + ) + wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return wrapper.worker + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + output = self.driver_worker.execute_model(execute_model_req) + return output + + +class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req) + return output + + +def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: + if config.dtype == torch.bfloat16: + logger.warning( + "bfloat16 is not fully supported on XPU, casting to float16.") + config.dtype = torch.float16 + if not config.enforce_eager: + logger.warning( + "CUDA graph is not supported on XPU, fallback to the eager " + "mode.") + config.enforce_eager = True + return config diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 56aa629ae3455..0db72d8d95f24 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,6 @@ import torch.nn as nn -from vllm.utils import is_cpu, is_hip, is_tpu +from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu class CustomOp(nn.Module): @@ -29,9 +29,7 @@ def forward_hip(self, *args, **kwargs): return self.forward_cuda(*args, **kwargs) def forward_xpu(self, *args, **kwargs): - # By default, we assume that XPU ops are compatible with CUDA ops. - # NOTE(woosuk): This is a placeholder for future extensions. - return self.forward_cuda(*args, **kwargs) + raise NotImplementedError def forward_cpu(self, *args, **kwargs): # By default, we assume that CPU ops are compatible with CUDA ops. @@ -58,5 +56,7 @@ def dispatch_forward(self): return self.forward_cpu elif is_tpu(): return self.forward_tpu + elif is_xpu(): + return self.forward_xpu else: return self.forward_cuda diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 4d076421f9d2a..eb0606948686d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,6 +37,15 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.silu_and_mul(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + class GeluAndMul(CustomOp): """An activation function for GeGLU. 
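# Reference semantics (for sanity checking only) of the silu_and_mul path
# added above, written in plain PyTorch; the patch itself dispatches to
# ipex.llm.functional.silu_mul on XPU.
import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half and gate the first half with the second.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 16)
assert silu_and_mul_reference(x).shape == (4, 8)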
@@ -71,6 +80,18 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_tanh_and_mul(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + if self.approximate == "none": + ops.gelu_and_mul(out, x) + elif self.approximate == "tanh": + ops.gelu_tanh_and_mul(out, x) + return out + def extra_repr(self) -> str: return f'approximate={repr(self.approximate)}' @@ -90,6 +111,13 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_new(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + class FastGELU(CustomOp): @@ -105,6 +133,13 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_fast(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 4533adf8f83aa..14f5e2378a421 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -67,6 +67,30 @@ def forward_cuda( ) return out + def forward_xpu( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm._ipex_ops import ipex_ops as ops + + if residual is not None: + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) + return x, residual + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + self.weight.data, + self.variance_epsilon, + ) + return out + def extra_repr(self) -> str: s = f"hidden_size={self.weight.data.size(0)}" s += f", eps={self.variance_epsilon}" diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 5a4940acbbef2..9c0a74cdab96e 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -221,6 +221,29 @@ def forward_cuda( self.cos_sin_cache, self.is_neox_style) return query, key + def forward_xpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + from vllm._ipex_ops import ipex_ops as ops + + self.cos_sin_cache = self.cos_sin_cache.to(positions.device, + dtype=query.dtype) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. 
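# Reference sketch of the NeoX-style rotation these rotary kernels implement,
# ignoring caching, batching, and the non-NeoX (interleaved) layout.
import torch

def rotate_neox(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # cos/sin must be broadcastable to x; x's last dim is [first_half, second_half].
    x1, x2 = torch.chunk(x, 2, dim=-1)
    return x * cos + torch.cat((-x2, x1), dim=-1) * sin

q = torch.randn(1, 8, 64)          # [batch, heads, rotary_dim] (made-up sizes)
half = torch.randn(1, 1, 32)
cos, sin = half.cos().repeat(1, 1, 2), half.sin().repeat(1, 1, 2)
assert rotate_neox(q, cos, sin).shape == q.shape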
+ if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + def forward_tpu( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 60eb5b404e2ca..1a26c5c63fedc 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -307,7 +307,7 @@ def forward(self, input_): else: masked_input = input_ # Get the embeddings. - output_parallel = F.embedding(masked_input, self.weight) + output_parallel = F.embedding(masked_input.long(), self.weight) # Mask the output embedding. if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(1), 0) diff --git a/vllm/utils.py b/vllm/utils.py index 9b39ca77a9801..1adfa9218c047 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -160,6 +160,26 @@ def is_tpu() -> bool: return libtpu is not None +@lru_cache(maxsize=None) +def is_xpu() -> bool: + from importlib.metadata import version + is_xpu_flag = "xpu" in version("vllm") + # vllm is not build with xpu + if not is_xpu_flag: + return False + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True + except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False + # ipex dependency is not ready + if not _import_ipex: + logger.warning("not found ipex lib") + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() + + @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -482,6 +502,9 @@ def is_pin_memory_available() -> bool: print_warning_once("Using 'pin_memory=False' as WSL is detected. " "This may slow down the performance.") return False + elif is_xpu(): + print_warning_once("Pin memory is not supported on XPU.") + return False elif is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False @@ -497,8 +520,12 @@ def __init__(self, device: Optional[torch.types.Device] = None): def current_memory_usage(self) -> float: # Return the memory usage in bytes. 
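# Hypothetical usage of the memory profiler being extended here: it is a
# context manager, and consumed_memory is read after the block exits
# (mirroring how XPUModelRunner.load_model uses it later in this patch).
import torch
from vllm.utils import CudaMemoryProfiler

with CudaMemoryProfiler() as m:
    _weights = torch.empty(256, 256, device="cuda")  # placeholder workload
print(f"consumed {m.consumed_memory / float(2**30):.4f} GB")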
- torch.cuda.reset_peak_memory_stats(self.device) - mem = torch.cuda.max_memory_allocated(self.device) + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + elif is_xpu(): + torch.xpu.reset_peak_memory_stats(self.device) + mem = torch.xpu.max_memory_allocated(self.device) return mem def __enter__(self): diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 341b177d4af2a..fbd1343fea19c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -4,7 +4,7 @@ import torch from vllm.attention import get_attn_backend -from vllm.config import CacheConfig, ModelConfig, ParallelConfig +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) @@ -25,10 +25,12 @@ def __init__( cache_config: CacheConfig, model_config: ModelConfig, parallel_config: ParallelConfig, + device_config: DeviceConfig, ) -> None: self.cache_config = cache_config self.model_config = model_config self.parallel_config = parallel_config + self.device_config = device_config self.head_size = model_config.get_head_size() self.num_layers = model_config.get_num_layers(parallel_config) @@ -55,7 +57,8 @@ def __init__( ) # Initialize the cache. - self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda") + self.gpu_cache = self._allocate_kv_cache( + self.num_gpu_blocks, self.device_config.device_type) self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") def _allocate_kv_cache( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7a378a862d0c0..f9b8a065a8b24 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -205,7 +205,8 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config) + self.parallel_config, + self.device_config) self.gpu_cache = self.cache_engine.gpu_cache def _warm_up_model(self) -> None: diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py new file mode 100644 index 0000000000000..f30de703e805d --- /dev/null +++ b/vllm/worker/xpu_model_runner.py @@ -0,0 +1,417 @@ +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn + +from vllm.attention import get_attn_backend +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) +from vllm.distributed import broadcast_tensor_dict +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.sampling_params import SamplingParams +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad +from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata + +logger = init_logger(__name__) + +_PAD_SLOT_ID = -1 +_BATCH_SIZE_ALIGNMENT = 8 +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ + _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) +] + + +class XPUModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: 
Optional[VisionLanguageConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + *args, + **kwargs, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.lora_config = lora_config + self.load_config = load_config + self.cache_config = cache_config + self.vision_language_config = vision_language_config + self.is_driver_worker = is_driver_worker + + self.sliding_window = model_config.get_sliding_window() + self.device_config = device_config + self.device = self.device_config.device + + self.kv_cache_dtype = kv_cache_dtype + self.block_size = cache_config.block_size + self.max_context_len_to_capture = ( + self.model_config.max_context_len_to_capture + if self.model_config is not None else 0) + + self.attn_backend = get_attn_backend( + self.model_config.get_num_attention_heads(self.parallel_config), + self.model_config.get_head_size(), + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + ) + + # Lazy initialization. + self.model: nn.Module # Set after init_Model + + def load_model(self) -> None: + with CudaMemoryProfiler() as m: + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + cache_config=self.cache_config, + ) + + self.model_memory_usage = m.consumed_memory + logger.info("Loading model weights took %.4f GB", + self.model_memory_usage / float(2**30)) + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + + @torch.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for vision encoding, which needs + # to be accounted for when calculating the GPU blocks for + # vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + + seq_data = SequenceData([0] * seq_len) + dummy_multi_modal_data = None + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=None, + multi_modal_data=dummy_multi_modal_data, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. 
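# Illustration only: the dummy-sequence sizing in profile_run above spreads
# max_num_batched_tokens as evenly as possible over max_num_seqs sequences.
# With invented numbers, 10 tokens over 3 sequences gives [4, 3, 3].
max_num_batched_tokens, max_num_seqs = 10, 3
seq_lens = [
    max_num_batched_tokens // max_num_seqs
    + (group_id < max_num_batched_tokens % max_num_seqs)
    for group_id in range(max_num_seqs)
]
assert seq_lens == [4, 3, 3]
assert sum(seq_lens) == max_num_batched_tokens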
+ num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + self.execute_model(seqs, kv_caches) + torch.xpu.synchronize() + return + + def prepare_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, + Optional[torch.Tensor]]: + multi_modal_input = None + if self.is_driver_worker: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_input + ) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, + attn_metadata) = self._prepare_decode(seq_group_metadata_list) + seq_lens = [] + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + # subquery_lens is not needed if chunked prefill is not + # supported. Since CPU worker doesn't support chunked prefill + # just use seq_lens instead. + seq_lens, + self.device, + pin_memory=False) + # Broadcast the metadata. + metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": + sampling_metadata.selected_token_indices, + } + metadata_dict.update(attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + selected_token_indices = metadata_dict.pop( + "selected_token_indices") + attn_metadata = self.attn_backend.make_metadata(**metadata_dict) + sampling_metadata = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + + return (input_tokens, input_positions, attn_metadata, + sampling_metadata, multi_modal_input) + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + block_tables: List[List[int]] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + assert seq_group_metadata.token_chunk_size == 1 + + seq_ids = list(seq_group_metadata.seq_data.keys()) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append(generation_token) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append(position) + + seq_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + seq_lens.append(seq_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + max_decode_seq_len = max(seq_lens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = 
torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) + + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seqlen_q=None, + max_seqlen=None, + seq_lens_tensor=seq_lens_tensor, + max_decode_seq_len=max_decode_seq_len, + num_prefill_tokens=0, + num_decode_tokens=len(input_tokens), + num_prefills=0, + block_tables=block_tables, + ) + return ( + input_tokens, + input_positions, + attn_metadata, + ) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[torch.Tensor], + ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, attn_metadata, sampling_metadata, + multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) + + model_executable = self.model + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + + hidden_states = model_executable(**execute_model_kwargs) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, sampling_metadata) + + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return None + + # Sample the next token. + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + return output + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + Optional[torch.Tensor]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + multi_modal_input_list: List[torch.Tensor] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + computed_len = seq_data.get_num_computed_tokens() + seq_len = len(prompt_tokens) + + seq_lens.append(seq_len) # Prompt token num + input_tokens.extend(prompt_tokens) # Token ids + + # Token position ids + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.extend(list(range(computed_len, seq_len))) + + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.extend([_PAD_SLOT_ID] * seq_len) + continue + + # Compute the slot mapping. + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, seq_len - sliding_window). 
+ # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. + start_idx = 0 + if self.sliding_window is not None: + start_idx = max(0, seq_len - self.sliding_window) + + for i in range(computed_len, seq_len): + if i < start_idx: + slot_mapping.append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // + self.block_size] # type: ignore + block_offset = i % self.block_size # type: ignore + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + + num_prompt_tokens = len(input_tokens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) # type: ignore + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) # type: ignore + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) # type: ignore + + max_seqlen = max(seq_lens) + tmp = [0] + tmp.extend(seq_lens) + seqlen = torch.tensor(tmp) + seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seqlen_q=seqlen_q, + max_seqlen=max_seqlen, + seq_lens_tensor=None, + max_decode_seq_len=None, + num_prefills=len(seq_lens), + num_prefill_tokens=num_prompt_tokens, + num_decode_tokens=0, + block_tables=torch.tensor([], device=self.device, dtype=torch.int), + ) + return (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_input) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py new file mode 100644 index 0000000000000..773ee9f8159e1 --- /dev/null +++ b/vllm/worker/xpu_worker.py @@ -0,0 +1,193 @@ +"""A XPU worker class.""" +import gc +import os +from typing import List, Optional, Tuple + +import intel_extension_for_pytorch # noqa: F401 +import oneccl_bindings_for_pytorch # noqa: F401 +import torch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.logger import init_logger +from vllm.model_executor import set_random_seed +from vllm.utils import is_xpu +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.worker import Worker +from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.xpu_model_runner import XPUModelRunner + +logger = init_logger(__name__) + + +class XPUWorker(LoraNotSupportedWorkerBase, Worker): + """A worker class that executes (a partition of) the model on a GPU. + + Each worker is associated with a single XPU device. The worker is + responsible for maintaining the KV cache and executing the model on the + XPU. In case of distributed inference, each worker is assigned a partition + of the model. 
+ """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + vision_language_config: Optional[VisionLanguageConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + is_driver_worker: bool = False, + ) -> None: + assert device_config.device_type == "xpu" + assert is_xpu() + + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.load_config = load_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + self.vision_language_config = vision_language_config + if self.vision_language_config: + assert not self.lora_config, ( + "To be tested: vision language model with LoRA settings.") + + self.model_runner = XPUModelRunner( # type: ignore + model_config, + parallel_config, + scheduler_config, + device_config, + cache_config, + load_config=self.load_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, + vision_language_config=vision_language_config, + ) + # Uninitialized cache engine. Will be initialized by + # initialize_cache. + self.cache_engine: CacheEngine + self.gpu_cache: List[torch.Tensor] + + def init_device(self) -> None: + if self.device_config.device.type == "xpu" and is_xpu(): + self.device = torch.device(f"xpu:{self.local_rank}") + torch.xpu.set_device(self.device) + torch.xpu.empty_cache() + self.init_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + self.init_worker_distributed_environment() + # Initialize the model. + set_random_seed(self.model_config.seed) + + # keep this method for `empty_cache` and `synchronize` api + @torch.inference_mode() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + torch.xpu.empty_cache() + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. 
+ torch.xpu.synchronize() + used_memory = torch.xpu.memory_allocated() + total_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + free_gpu_memory = total_gpu_memory - used_memory + + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + peak_memory = self.init_gpu_memory - free_gpu_memory + assert peak_memory > 0, ( + "Error in memory profiling. This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + + cache_block_size = self.get_cache_block_size_bytes() + num_gpu_blocks = int( + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) + num_gpu_blocks = max(num_gpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + gc.collect() + torch.xpu.empty_cache() + return num_gpu_blocks, num_cpu_blocks + + def _warm_up_model(self) -> None: + # IPEX don't support capture graph yet + pass + + def init_worker_distributed_environment(self) -> None: + """Initialize the distributed environment.""" + + parallel_config = self.parallel_config + rank = self.rank + distributed_init_method = self.distributed_init_method + + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch " + "world size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + # use sockets as default Level zero IPC exchange backend. By + # default oneccl will use `drmfd` as mechanism which need extra + # dependency (libdrm and drm headers) on your system. + ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", + "sockets") + os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE + init_distributed_environment( + world_size=parallel_config.world_size, + rank=rank, + distributed_init_method=distributed_init_method, + local_rank=self.local_rank, + backend="ccl") + + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) From ab66536dbfedff4ffcbb6dc9f9a21d0a9ac0ec91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Tue, 18 Jun 2024 02:36:10 +0800 Subject: [PATCH 21/98] [CI/BUILD] Support non-AVX512 vLLM building and testing (#5574) --- .buildkite/run-cpu-test.sh | 5 ++++- Dockerfile.cpu | 4 ++++ cmake/cpu_extension.cmake | 13 ++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 5f9ca5d75994c..532d6ad8811de 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -4,17 +4,20 @@ set -ex # Try building the docker image docker build -t cpu-test -f Dockerfile.cpu . +docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . 
# Setup cleanup -remove_docker_container() { docker rm -f cpu-test || true; } +remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } trap remove_docker_container EXIT remove_docker_container # Run the image docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test +docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" +docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c "cd tests; diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 777bb08296ed9..6e55203decc56 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -21,6 +21,10 @@ WORKDIR /workspace/vllm RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... +ARG VLLM_CPU_DISABLE_AVX512 +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install WORKDIR /workspace/ diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index a644e5b6a8b21..511e443f78403 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -33,10 +33,21 @@ function (find_isa CPUINFO TARGET OUT) endif() endfunction() +function (is_avx512_disabled OUT) + set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) + if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + +is_avx512_disabled(AVX512_DISABLED) + find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) -if (AVX512_FOUND) +if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS "-mavx512f" "-mavx512vl" From 9e4e6fe2073ff5e4a747d5ce2a08d321268b7254 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 17 Jun 2024 11:41:08 -0700 Subject: [PATCH 22/98] [CI] the readability of benchmarking and prepare for dashboard (#5571) [CI] Improve the readability of performance benchmarking results and prepare for upcoming performance dashboard (#5571) --- .buildkite/nightly-benchmarks/README.md | 21 +- .../run-benchmarks-suite.sh | 6 +- .../convert-results-json-to-markdown.py | 260 ++++++++++-------- .../nightly-benchmarks/tests/descriptions.md | 67 +++++ .../{ => tests}/latency-tests.json | 2 +- .../{ => tests}/serving-tests.json | 2 +- .../{ => tests}/throughput-tests.json | 2 +- benchmarks/benchmark_latency.py | 2 +- 8 files changed, 232 insertions(+), 130 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/tests/descriptions.md rename .buildkite/nightly-benchmarks/{ => tests}/latency-tests.json (99%) rename .buildkite/nightly-benchmarks/{ => tests}/serving-tests.json (99%) rename .buildkite/nightly-benchmarks/{ => tests}/throughput-tests.json (99%) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index 6a18be947be99..4036b32a46bf7 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -13,9 +13,17 @@ This benchmark will be *triggered* upon: **Benchmarking Duration**: about 1hr. 
-## Configuring the workload for the quick benchmark +**For benchmarking developers**: please try your best to constraint the duration of benchmarking to less than 1.5 hr so that it won't take forever to run. -The workload of the quick benchmark contains two parts: latency tests in `latency-tests.json`, throughput tests in `throughput-tests.json` and serving tests in `serving-tests.json`. + +## Configuring the workload + +The benchmarking workload contains three parts: +- Latency tests in `latency-tests.json`. +- Throughput tests in `throughput-tests.json`. +- Serving tests in `serving-tests.json`. + +See [descriptions.md](tests/descriptions.md) for detailed descriptions. ### Latency test @@ -23,7 +31,6 @@ Here is an example of one test inside `latency-tests.json`: ```json [ - ... { "test_name": "latency_llama8B_tp1", "parameters": { @@ -34,7 +41,6 @@ Here is an example of one test inside `latency-tests.json`: "num_iters": 15 } }, - ... ] ``` @@ -57,7 +63,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t ``` [ - ... { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], @@ -77,7 +82,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t "num_prompts": 200 } }, - ... ] ``` @@ -92,7 +96,8 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. ## Visualizing the results -The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table. +The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. -The JSON file is also attached within each buildkite job for further analysis. \ No newline at end of file +The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. +The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. 
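To make the formatting step above concrete, here is a minimal sketch of how a markdown template with named placeholders (such as `{latency_tests_markdown_table}`, which `convert-results-json-to-markdown.py` fills in later in this patch) can be rendered from a pandas DataFrame. The one-row result below is made up purely for illustration.

```python
# Minimal sketch: render a results DataFrame as a markdown (pipe) table and
# substitute it into a template that uses named placeholders.
import pandas as pd
from tabulate import tabulate

template = "## Latency tests\n\n{latency_tests_markdown_table}\n"

# Made-up single-row result, for demonstration only.
latency_results = pd.DataFrame([{
    "Test name": "latency_llama8B_tp1",
    "Mean latency (ms)": 123.4,
}])

table = tabulate(latency_results,
                 headers="keys",
                 tablefmt="pipe",
                 showindex=False)
print(template.format(latency_tests_markdown_table=table))
```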
diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh index 6cff6917f8ad5..021473f76d0e5 100644 --- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh +++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh @@ -343,9 +343,9 @@ main() { QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ # benchmarking - run_serving_tests $QUICK_BENCHMARK_ROOT/serving-tests.json - run_latency_tests $QUICK_BENCHMARK_ROOT/latency-tests.json - run_throughput_tests $QUICK_BENCHMARK_ROOT/throughput-tests.json + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json + run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json # postprocess benchmarking results diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 75cff84347942..9aa8162d18d2d 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path import pandas as pd @@ -11,12 +12,13 @@ latency_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", - "avg_latency": "Average latency (s)", - "P10": "P10 (s)", - "P25": "P25 (s)", - "P50": "P50 (s)", - "P75": "P75 (s)", - "P90": "P90 (s)", + "avg_latency": "Mean latency (ms)", + # "P10": "P10 (s)", + # "P25": "P25 (s)", + "P50": "Median", + # "P75": "P75 (s)", + # "P90": "P90 (s)", + "P99": "P99", } # thoughput tests and the keys that will be printed into markdown @@ -24,11 +26,11 @@ throughput_results_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", - "num_requests": "# of req.", - "total_num_tokens": "Total # of tokens", - "elapsed_time": "Elapsed time (s)", + # "num_requests": "# of req.", + # "total_num_tokens": "Total # of tokens", + # "elapsed_time": "Elapsed time (s)", "requests_per_second": "Tput (req/s)", - "tokens_per_second": "Tput (tok/s)", + # "tokens_per_second": "Tput (tok/s)", } # serving results and the keys that will be printed into markdown @@ -36,120 +38,148 @@ serving_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", - "completed": "# of req.", + # "completed": "# of req.", "request_throughput": "Tput (req/s)", - "input_throughput": "Input Tput (tok/s)", - "output_throughput": "Output Tput (tok/s)", + # "input_throughput": "Input Tput (tok/s)", + # "output_throughput": "Output Tput (tok/s)", "mean_ttft_ms": "Mean TTFT (ms)", # do not say TTFT again to avoid the table getting too wide "median_ttft_ms": "Median", "p99_ttft_ms": "P99", - "mean_tpot_ms": "Mean TPOT (ms)", - "median_tpot_ms": "Median", - "p99_tpot_ms": "P99", + # "mean_tpot_ms": "Mean TPOT (ms)", + # "median_tpot_ms": "Median", + # "p99_tpot_ms": "P99", "mean_itl_ms": "Mean ITL (ms)", "median_itl_ms": "Median", "p99_itl_ms": "P99", } -for test_file in results_folder.glob("*.json"): - - with open(test_file, "r") as f: - raw_result = json.loads(f.read()) - - if "serving" in str(test_file): - # this result is generated via `benchmark_serving.py` - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - raw_result.update({"test_name": test_file.stem}) - - # add the result to raw_result - 
serving_results.append(raw_result) - continue - - elif "latency" in f.name: - # this result is generated via `benchmark_latency.py` - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - raw_result.update({"test_name": test_file.stem}) - - # get different percentiles - for perc in [10, 25, 50, 75, 90]: - raw_result.update( - {f"P{perc}": raw_result["percentiles"][str(perc)]}) - - # add the result to raw_result - latency_results.append(raw_result) - continue - - elif "throughput" in f.name: - # this result is generated via `benchmark_throughput.py` - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - raw_result.update({"test_name": test_file.stem}) - - # add the result to raw_result - throughput_results.append(raw_result) - continue - - print(f"Skipping {test_file}") - -latency_results = pd.DataFrame.from_dict(latency_results) -serving_results = pd.DataFrame.from_dict(serving_results) -throughput_results = pd.DataFrame.from_dict(throughput_results) - -# remapping the key, for visualization purpose -if not latency_results.empty: - latency_results = latency_results[list( - latency_column_mapping.keys())].rename(columns=latency_column_mapping) -if not serving_results.empty: - serving_results = serving_results[list( - serving_column_mapping.keys())].rename(columns=serving_column_mapping) -if not throughput_results.empty: - throughput_results = throughput_results[list( - throughput_results_column_mapping.keys())].rename( - columns=throughput_results_column_mapping) - -# get markdown tables -latency_md_table = tabulate(latency_results, - headers='keys', - tablefmt='pipe', - showindex=False) -serving_md_table = tabulate(serving_results, - headers='keys', - tablefmt='pipe', - showindex=False) -throughput_md_table = tabulate(throughput_results, - headers='keys', - tablefmt='pipe', - showindex=False) - -# document the result -with open(results_folder / "benchmark_results.md", "w") as f: + +def read_markdown(file): + if os.path.exists(file): + with open(file, "r") as f: + return f.read() + "\n" + else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, serving): + return json.dumps({ + 'latency': latency.to_dict(), + 'throughput': throughput.to_dict(), + 'serving': serving.to_dict() + }) + + +if __name__ == "__main__": + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file, "r") as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get 
different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + raw_results_json = results_to_json(latency_results, throughput_results, + serving_results) + + # remapping the key, for visualization purpose if not latency_results.empty: - f.write("## Latency tests\n") - f.write(latency_md_table) - f.write("\n") - if not throughput_results.empty: - f.write("## Throughput tests\n") - f.write(throughput_md_table) - f.write("\n") + latency_results = latency_results[list( + latency_column_mapping.keys())].rename( + columns=latency_column_mapping) if not serving_results.empty: - f.write("## Serving tests\n") - f.write(serving_md_table) - f.write("\n") + serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + if not throughput_results.empty: + throughput_results = throughput_results[list( + throughput_results_column_mapping.keys())].rename( + columns=throughput_results_column_mapping) + + processed_results_json = results_to_json(latency_results, + throughput_results, + serving_results) + + # get markdown tables + latency_md_table = tabulate(latency_results, + headers='keys', + tablefmt='pipe', + showindex=False) + serving_md_table = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) + throughput_md_table = tabulate(throughput_results, + headers='keys', + tablefmt='pipe', + showindex=False) + + # document the result + with open(results_folder / "benchmark_results.md", "w") as f: + + results = read_markdown( + "../.buildkite/nightly-benchmarks/tests/descriptions.md") + results = results.format( + latency_tests_markdown_table=latency_md_table, + throughput_tests_markdown_table=throughput_md_table, + serving_tests_markdown_table=serving_md_table, + benchmarking_results_in_json_string=processed_results_json) + f.write(results) diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md new file mode 100644 index 0000000000000..891e4917070d9 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/descriptions.md @@ -0,0 +1,67 @@ + +## Latency tests + +This test suite aims to test vllm's end-to-end latency under a controlled setup. + +- Input length: 32 tokens. +- Output length: 128 tokens. +- Batch size: fixed (8). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: end-to-end latency (mean, median, p99). + +### Latency benchmarking results + +{latency_tests_markdown_table} + +## Throughput tests + +This test suite aims to test vllm's throughput. 
+ +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm to achieve maximum throughput. +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput. + +### Throughput benchmarking results + +{throughput_tests_markdown_table} + +## Serving tests + +This test suite aims to test vllm's real serving metrics. + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm and the arrival pattern of the requests. +- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). + +### Serving benchmarking results + +{serving_tests_markdown_table} + +## json version of the benchmarking tables + +This section contains the data of the markdown tables above in JSON format. +You can load the benchmarking tables into pandas dataframes as follows: + +```python +import json +import pandas as pd + +benchmarking_results_json = """The json string""" +benchmarking_results = json.loads(benchmarking_results_json) +latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"]) +throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"]) +serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) +``` + +The json string for all benchmarking tables: +```json +{benchmarking_results_in_json_string} +``` + +You can also check the raw experiment data in the Artifact tab of the Buildkite page. 
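The serving tests above issue requests according to a random Poisson process at a fixed average QPS. As a rough, self-contained sketch (the function name and sampling details are illustrative assumptions, not code taken from the benchmark scripts), inter-arrival times of a Poisson process with rate `qps` can be drawn from an exponential distribution with mean `1 / qps`:

```python
# Illustrative sketch: arrival times for a Poisson request process.
# qps == inf corresponds to issuing all requests at once.
import numpy as np


def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0):
    rng = np.random.default_rng(seed)  # fixed seed -> reproducible arrivals
    if qps == float("inf"):
        return [0.0] * num_requests
    # Exponential inter-arrival times with mean 1 / qps.
    intervals = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(intervals).tolist()


print(poisson_arrival_times(num_requests=5, qps=4))
```

With `qps=4`, the mean spacing between consecutive arrivals is 0.25 s; fixing the seed mirrors the "fixed random seed" noted in the description above.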
+ diff --git a/.buildkite/nightly-benchmarks/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json similarity index 99% rename from .buildkite/nightly-benchmarks/latency-tests.json rename to .buildkite/nightly-benchmarks/tests/latency-tests.json index 294a8c439c3ae..06488cd79110a 100644 --- a/.buildkite/nightly-benchmarks/latency-tests.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -29,4 +29,4 @@ "num-iters": 15 } } -] +] \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json similarity index 99% rename from .buildkite/nightly-benchmarks/serving-tests.json rename to .buildkite/nightly-benchmarks/tests/serving-tests.json index bb674661279d3..86a0fefa339f7 100644 --- a/.buildkite/nightly-benchmarks/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -56,4 +56,4 @@ "num_prompts": 200 } } -] +] \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json similarity index 99% rename from .buildkite/nightly-benchmarks/throughput-tests.json rename to .buildkite/nightly-benchmarks/tests/throughput-tests.json index db4f908d79971..41ac135748704 100644 --- a/.buildkite/nightly-benchmarks/throughput-tests.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -32,4 +32,4 @@ "backend": "vllm" } } -] +] \ No newline at end of file diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 11d1bf7a4c58f..767afd21aeacf 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -98,7 +98,7 @@ def run_to_completion(profile_dir: Optional[str] = None): for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): latencies.append(run_to_completion(profile_dir=None)) latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90] + percentages = [10, 25, 50, 75, 90, 99] percentiles = np.percentile(latencies, percentages) print(f'Avg latency: {np.mean(latencies)} seconds') for percentage, percentile in zip(percentages, percentiles): From 1b44aaf4e3559e4e321f32715b08f1aa7e4f3d50 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 17 Jun 2024 14:35:04 -0700 Subject: [PATCH 23/98] [bugfix][distributed] fix 16 gpus local rank arrangement (#5604) --- vllm/executor/ray_gpu_executor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 89d1c4ac7cbc2..843332e5ea0c8 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -137,6 +137,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. 
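+            # e.g. (illustrative) sorted(["11", "2"]) == ["11", "2"], while
+            # the intended numeric order is [2, 11], hence the conversion.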
+ # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] node_gpus[node_id].extend(gpu_ids) for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) From e441bad674e6dca076a145ca63f72100318c51e5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 17 Jun 2024 15:08:05 -0700 Subject: [PATCH 24/98] [Optimization] use a pool to reuse LogicalTokenBlock.token_ids (#5584) --- vllm/block.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index 2cc6b947f2255..e7fb29c8c2c61 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -1,5 +1,7 @@ """Token blocks.""" -from typing import List +import weakref +from collections import defaultdict +from typing import Dict, List from vllm.utils import Device @@ -7,6 +9,35 @@ DEFAULT_LAST_ACCESSED_TIME = -1 +TokensBlock = List[int] + + +class BlockPool: + """A pool of physical blocks. + When requests come, we create a lot of logical blocks; + when requests are done, we destroy a lot of logical blocks. + It turns out that creating and destroying logical blocks can be expensive, + especially for the `token_ids` field, which is a list of integers. + To avoid this overhead, we use a pool to manage the logical blocks. + When an old request is done and a new request comes, we can reuse the + logical blocks from the old request to feed the new request. + """ + + def __init__(self) -> None: + # block size to list of token blocks + self.pool: Dict[int, List[TokensBlock]] = defaultdict(list) + + def alloc_block(self, block_size: int) -> TokensBlock: + if block_size in self.pool and self.pool[block_size]: + return self.pool[block_size].pop() + return [_BLANK_TOKEN_ID] * block_size + + def del_block(self, block: TokensBlock) -> None: + self.pool[len(block)].append(block) + + +_BLOCK_POOL = BlockPool() + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -23,7 +54,13 @@ def __init__( self.block_number = block_number self.block_size = block_size - self.token_ids = [_BLANK_TOKEN_ID] * block_size + self.token_ids = _BLOCK_POOL.alloc_block(block_size) + # this finalizer is used to return the block to the pool when the object is deleted # noqa + # NOTE: don't use __del__ because it cannot guarantee the order of finalization, # noqa + # i.e. 
`self.token_ids` may be deleted before `self`, and we lose + # the opportunity to return the block to the pool + self._finalizer = weakref.finalize(self, _BLOCK_POOL.del_block, + self.token_ids) self.num_tokens = 0 def is_empty(self) -> bool: From a3e8a05d4c1b79dd44eb92bb6f57eb40c3fbdb21 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Mon, 17 Jun 2024 15:26:41 -0700 Subject: [PATCH 25/98] [Bugfix] Fix KV head calculation for MPT models when using GQA (#5142) --- vllm/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index b1a3a82f5a6c0..d95faf52db1a0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -302,7 +302,11 @@ def get_total_num_kv_heads(self) -> int: return 1 # For DBRX and MPT - if self.hf_config.model_type in ["dbrx", "mpt"]: + if self.hf_config.model_type == "mpt": + if "kv_n_heads" in self.hf_config.attn_config: + return self.hf_config.attn_config["kv_n_heads"] + return self.hf_config.num_attention_heads + if self.hf_config.model_type == "dbrx": return getattr(self.hf_config.attn_config, "kv_n_heads", self.hf_config.num_attention_heads) From 26e1188e51aca3b76184671d804a8b17c294b610 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Mon, 17 Jun 2024 16:16:10 -0700 Subject: [PATCH 26/98] [Fix] Use utf-8 encoding in entrypoints/openai/run_batch.py (#5606) --- vllm/entrypoints/openai/run_batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index b0c0f4ad2f47e..2f18701870a6f 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -58,7 +58,7 @@ async def read_file(path_or_url: str) -> str: session.get(path_or_url) as resp: return await resp.text() else: - with open(path_or_url, "r") as f: + with open(path_or_url, "r", encoding="utf-8") as f: return f.read() @@ -71,7 +71,7 @@ async def write_file(path_or_url: str, data: str) -> None: # We should make this async, but as long as this is always run as a # standalone program, blocking the event loop won't effect performance # in this particular case. 
- with open(path_or_url, "w") as f: + with open(path_or_url, "w", encoding="utf-8") as f: f.write(data) From fa9e3852290ecb6eaae45befbd629bb060f57fb7 Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Mon, 17 Jun 2024 19:29:09 -0700 Subject: [PATCH 27/98] [Speculative Decoding 1/2 ] Add typical acceptance sampling as one of the sampling techniques in the verifier (#5131) --- .../test_typical_acceptance_sampler.py | 464 ++++++++++++++++++ .../layers/rejection_sampler.py | 174 +------ .../layers/spec_decode_base_sampler.py | 206 ++++++++ .../layers/typical_acceptance_sampler.py | 186 +++++++ 4 files changed, 866 insertions(+), 164 deletions(-) create mode 100644 tests/samplers/test_typical_acceptance_sampler.py create mode 100644 vllm/model_executor/layers/spec_decode_base_sampler.py create mode 100644 vllm/model_executor/layers/typical_acceptance_sampler.py diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py new file mode 100644 index 0000000000000..87cf37bc926bc --- /dev/null +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -0,0 +1,464 @@ +"""Tests for rejection sampling.""" + +import pytest +import torch + +from vllm.model_executor.layers.typical_acceptance_sampler import ( + TypicalAcceptanceSampler) +from vllm.model_executor.utils import set_random_seed + +CUDA_DEVICES = [f"cuda:{i}" for i in range(1)] + + +def get_zero_temperature_prob_dist(batch_size, k, vocab_size): + """ + Generates a fake temperature zero probability distribution. + Returns: + 1. A fake temperature zero probability distribution of shape + [batch_size, k, vocab_size] + 2. Tensor of shape [batch_size, k] containing the token ids + of the probability 1.0 tokens at each position. + """ + # Simulate temperature 0 probability distribution for target probabilities + # and create target probabilities such that only 1 token id has + # probability 1.0 + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + probs = torch.rand(batch_size, k, vocab_size) + _, zero_temperature_token_ids = torch.max(probs, dim=-1) + # set the probability of the tokens with ids in zero_temperature_token_ids + # to 1 and the rest to 0. + target_probs = torch.zeros_like(probs).scatter_( + -1, zero_temperature_token_ids.unsqueeze(-1), 1.0) + return target_probs, zero_temperature_token_ids + + +def get_draft_token_ids(batch_size: int, k: int, vocab_size: int, + token_ids_to_exclude: torch.Tensor): + """ + Returns a tensor of shape [batch_size, k] of fake draft token ids + drawn randomly from a vocab of size vocab_size. We however ensure + that token_ids from token_ids_to_exclude are excluded at the + corresponding positions. 
+ """ + draft_token_ids = torch.empty(batch_size, k, dtype=torch.long) + for i in range(batch_size): + for j in range(k): + # Generate a random token ID excluding token_ids_to_exclude[i, j] + while True: + token_id = torch.randint(0, vocab_size, (1, )).item() + if token_id != token_ids_to_exclude[i, j]: + draft_token_ids[i, j] = token_id + break + return draft_token_ids + + +@pytest.mark.parametrize("k", list(range(1, 6))) +@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) +@pytest.mark.parametrize("batch_size", list(range(1, 32))) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, + device: str): + """ + Tests that the TypicalAcceptancSampler forward succeeds for + different combinations of k, vocab_size, batch_size and num devices. + """ + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler() + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + # Verify that sampling succeeds for all cases. + typical_acceptance_sampler(target_probs, bonus_token_ids, draft_token_ids) + + +@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) +@pytest.mark.parametrize("which_token_ids", + ["bonus_token_ids", "draft_token_ids"]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_raises_when_vocab_oob(above_or_below_vocab_range: str, + which_token_ids: str, device: str): + """ + Tests that we throw an exception of the token ids fall outside + the bound of the provided vocabulary. + """ + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler(strict_mode=True) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + # Verify that appropriate exceptions are thrown for out + # of bound vocabs. + oob_token_ids = None + if which_token_ids == "bonus_token_ids": + oob_token_ids = bonus_token_ids + elif which_token_ids == "draft_token_ids": + oob_token_ids = draft_token_ids + else: + raise AssertionError() + + if above_or_below_vocab_range == "above": + rogue_token_id = vocab_size + 1 + elif above_or_below_vocab_range == "below": + rogue_token_id = -1 + else: + raise AssertionError() + + oob_token_ids[0][0] = rogue_token_id + + with pytest.raises(AssertionError): + typical_acceptance_sampler(target_probs, bonus_token_ids, + draft_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_uniform_target_distribution_accepts_all_tokens( + seed: int, disable_bonus_tokens: bool, device: str): + """ + Test the TypicalAcceptanceSampler with a uniform target probability + distribution. 
+ + This test verifies that when provided with a uniform target probability + distribution, the TypicalAcceptanceSampler accepts all draft tokens. The + entropy of the uniform target distribution being high should lead to all + draft tokens being accepted. The test also ensures that the behavior + regarding bonus tokens is consistent with the `disable_bonus_tokens` + flag. + """ + set_random_seed(seed) + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + # We are using a uniform target probability distribution. + # For a uniform distribution the entropy is very high and it + # should lead to all draft tokens being accepted. Verify that. + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze()) + + assert torch.all(output_token_ids[:, :k] == draft_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_temperature_zero_target_distribution(seed: int, + disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with a zero-temperature target + probability distribution. + + This test verifies that when using a zero-temperature target probability + distribution, where only one token has a probability of 1.0, the + TypicalAcceptanceSampler correctly rejects all draft tokens that do not + match this probability. Additionally, it ensures that when all draft + tokens are rejected, the sampler falls back to greedy sampling to select a + single token from the target distribution. + """ + set_random_seed(seed) + k = 3 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Simulate temperature 0 probability distribution for target probabilities + # and create target probabilities such that only 1 token id has + # probability 1.0 + target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist( + batch_size, k, vocab_size) + # Populate draft_token_ids such that they exclude the token_ids + # with probability = 1.0 + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + # The target probaility distribution is a temperature zero distribution + # with zero entroy. Since our draft token ids don't match the probability + # 1.0 tokens in the target distribution we will reject all of them and + # fallback to the greedy sampling for selecting 1 token for each sequence. + # Verify the same. 
+ output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, -1] == -1) + assert torch.all(output_token_ids[:, 0] == zero_temperature_token_ids[:, + 0]) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with a mixed target probability + distribution. + + This test ensures that the TypicalAcceptanceSampler handles a mixed + target probability distribution correctly. Specifically, it uses a + zero-temperature distribution for some sequences and a uniform + distribution for others. The test verifies that: + + - For sequences with a zero-temperature distribution, only the token + with a probability of 1.0 is accepted, and all other tokens are rejected. + - For sequences with a uniform distribution, all draft tokens are + accepted. + - When `disable_bonus_tokens` is False, the bonus tokens are also accepted + for sequences with a uniform distribution. + """ + set_random_seed(seed) + k = 3 + batch_size = 4 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # For sequences 0 and 2 set the distribution to a temperature + # zero distribution. For sequences 1 and 3 set it to a uniform + # distribution. + target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32) + target_probs[[1, 3]] = uniform_probs + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + # verify the shape of output_token_ids + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + # For sequences 0 and 2 verify that only 1 token is accepted + # which is the token with probability 1.0 in the target distribution + # at position 0. + assert torch.all(output_token_ids[[0, 2], 1:] == -1) + assert (torch.all(output_token_ids[[0, 2], + 0] == zero_temperature_token_ids[[0, 2], + 0])) + # For sequences 1 and 3 verify that all tokens are accepted since the + # target probability distribution is uniform. In addition verify that + # if disable_bonus_tokens is false then we also accept the bonus tokens. + assert torch.all( + output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :]) + if disable_bonus_tokens: + assert torch.all(output_token_ids[[1, 3], -1] == -1) + else: + assert torch.all(output_token_ids[[1, 3], -1] != -1) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler's behavior when only a subset of draft + tokens should be accepted. 
+ + This test verifies that the TypicalAcceptanceSampler correctly accepts or + rejects draft tokens based on a zero-temperature target probability + distribution. Specifically, it ensures that: + + - When all draft tokens match tokens with a probability of 1.0 in the + target distribution, all draft tokens are accepted. + - When only some draft tokens match tokens with a probability of 1.0 in + the target distribution, only those matching tokens are accepted, and the + rest are rejected. + """ + set_random_seed(seed) + k = 5 + batch_size = 1 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Create a temperature zero target probability distribution and ensure + # all draft token ids correspond to the tokens with 1.0 probability. + # Verify that all of them are accepted. + target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + draft_token_ids = zero_temperature_token_ids + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids) + # Next only keep the first 2 draft tokens same as the zero temperature + # tokens. For the remaining 3 choose some other tokens. In the + # response we will expect the first 2 tokens to be the same as the + # draft tokens and the rest as -1 + draft_token_ids_to_replace = get_draft_token_ids( + batch_size, k, vocab_size, zero_temperature_token_ids) + draft_token_ids = torch.cat( + (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2]) + assert torch.all(output_token_ids[:, -3:] == -1) + + +@pytest.mark.parametrize("seed", list(range(1))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_accept_tokens_set_non_default_posteriors(seed: int, + disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler with custom posterior thresholds and + alpha values. This test verifies that by modifying the posterior + thresholds and alpha values we can change the acceptance behavior of the + sampler. + """ + set_random_seed(seed) + k = 5 + batch_size = 1 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + # Simulate temperature 0 probability distribution for target + # probabilities and create target probabilities such that only 1 token + # id has probability 1.0 and others have a very low probability of + # 0.00001. Populate draft_token_ids such that they exclude the token_ids + # with probability = 1.0. 
Without any changes to the posterior thresholds + # none of the draft tokens are accepted. + target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( + batch_size, k, vocab_size)) + target_probs[target_probs == 0] = 0.00001 + draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, + zero_temperature_token_ids) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 1:-1] == -1) + + # Change the posterior threshold values to 0.0 so that we will + # now accept even draft tokens with very low probability in the + # target distribution. Simulate and verify the same. + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, + disable_bonus_tokens=disable_bonus_tokens, + posterior_threshold=0.0, + posterior_alpha=0.0) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + output_token_ids = typical_acceptance_sampler(target_probs, + bonus_token_ids, + draft_token_ids) + assert output_token_ids.shape[0] == batch_size + assert output_token_ids.shape[1] == (k + 1) + assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) + if disable_bonus_tokens: + assert torch.all(output_token_ids[:, -1] == -1) + else: + assert torch.all(output_token_ids[:, -1] == bonus_token_ids) + + +@pytest.mark.parametrize("seed", list(range(10))) +@pytest.mark.parametrize("disable_bonus_tokens", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool, + device: str): + """ + Test the TypicalAcceptanceSampler's method for generating + replacement token IDs. + + This test verifies that the `_replacement_token_ids` method of the + TypicalAcceptanceSampler correctly identifies the token IDs to be used + as replacements based on the target probability distribution. + Specifically, it ensures that the method correctly identifies the + tokens with the highest probability for each sequence in the batch. 
+ """ + set_random_seed(seed) + k = 10 + batch_size = 5 + vocab_size = 30_000 + torch.set_default_device(device) + typical_acceptance_sampler = TypicalAcceptanceSampler( + strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) + typical_acceptance_sampler.init_gpu_tensors(rank=0) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + expected_replacement_tokens = -torch.ones( + (batch_size, k), dtype=torch.long) + expected_replacement_tokens[:, 0] = torch.argmax(target_probs[:, 0, :], + dim=1) + actual_replacement_tokens = ( + typical_acceptance_sampler._replacement_token_ids(target_probs)) + assert torch.all(expected_replacement_tokens == actual_replacement_tokens) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index a80703155c0b6..fe9b2fac1117e 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,12 +1,15 @@ from functools import cached_property -from typing import Optional, Tuple +from typing import Tuple import torch import torch.jit import torch.nn as nn +from vllm.model_executor.layers.spec_decode_base_sampler import ( + SpecDecodeBaseSampler) -class RejectionSampler(nn.Module): + +class RejectionSampler(SpecDecodeBaseSampler, nn.Module): """Apply modified rejection sampling as described in "Accelerating Large Language Model Decoding with Speculative Sampling" https://arxiv.org/pdf/2302.01318.pdf. @@ -22,39 +25,11 @@ def __init__(self, Require when bonus tokens will cause corrupt KV cache for proposal methods that require KV cache. strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. + during sampling. This catches correctness issues but adds + nontrivial latency. """ - super().__init__() - self._disable_bonus_tokens = disable_bonus_tokens - self._strict_mode = strict_mode - - # NOTE: A "bonus token" is accepted iff all proposal tokens are - # accepted. There is always only one possible bonus token. We store this - # value in a variable for readability. - self._num_bonus_tokens = 1 - - self.num_accepted_tokens: Optional[torch.Tensor] = None - self.num_emitted_tokens: Optional[torch.Tensor] = None - self.num_draft_tokens: int = 0 - - def init_gpu_tensors(self, rank: int) -> None: - assert self.num_accepted_tokens is None - device = f"cuda:{rank}" - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - - @property - def probs_dtype(self): - return torch.float32 - - @property - def token_id_dtype(self): - return torch.int64 + SpecDecodeBaseSampler.__init__(self, disable_bonus_tokens, strict_mode) + nn.Module.__init__(self) def forward( self, @@ -100,15 +75,8 @@ def forward( # Only perform shape/dtype/device checking in strict mode, as it adds # overhead. 
if self._strict_mode: - self._raise_if_incorrect_shape(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_incorrect_dtype(target_probs, bonus_token_ids, + self._raise_if_incorrect_input(target_probs, bonus_token_ids, draft_probs, draft_token_ids) - self._raise_if_inconsistent_device(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_out_of_bounds_vocab(target_probs.shape[-1], - bonus_token_ids, - draft_token_ids) accepted, recovered_token_ids = self._batch_modified_rejection_sampling( target_probs, @@ -272,128 +240,6 @@ def _smallest_positive_value(self) -> float: """ return torch.finfo(self.probs_dtype).tiny - def _create_output( - self, - accepted: torch.Tensor, # [batch_size, k] - recovered_token_ids: torch.Tensor, # [batch_size, k] - draft_token_ids: torch.Tensor, # [batch_size, k] - bonus_token_ids: torch.Tensor, # [batch_size] - ) -> torch.Tensor: - """Format output. Returns a matrix of token ids. When - a token is rejected via rejection sampling, all subsequent - token ids are set to -1 for the sequence. - - shape = [batch_size, k + num_bonus_tokens] - """ - bonus_token_ids = bonus_token_ids.squeeze() - batch_size, k = recovered_token_ids.shape - - # Determine the index of the first False value for each row. - limits = (accepted == 0).max(1).indices - limits[~(accepted == 0).any(1)] = k - - # Create masks using the indices. - indices = torch.arange(k, device=accepted.device).unsqueeze(0) - accepted_mask = indices < limits.unsqueeze(1) - after_false_mask = indices == limits.unsqueeze(1) - - # Create an extended output tensor - output_with_bonus_tokens = -torch.ones( - (batch_size, k + self._num_bonus_tokens), - dtype=self.token_id_dtype, - device=accepted.device) - output = output_with_bonus_tokens[:, :k] - - # Fill in the first k columns of the output tensor using masks and data - # tensors. - torch.where(accepted_mask, - draft_token_ids, - -torch.ones_like(draft_token_ids), - out=output) - - # Fill the last column. - # We check output directly as accepted may have True values inconsistent - # with causal acceptance. - output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, - bonus_token_ids, -1) - - # We disable bonus tokens because it causes corrupt KV cache for - # proposal methods that require KV cache. We can fix it by "prefilling" - # the bonus token in the proposer. The following issue tracks the fix. - # https://github.com/vllm-project/vllm/issues/4212 - if self._disable_bonus_tokens: - output_with_bonus_tokens[:, -1] = -1 - - # Fill the recovered token ids. 
- output.mul_(~after_false_mask).add_( - recovered_token_ids.mul(after_false_mask)) - - self.num_accepted_tokens += accepted.sum() - self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() - self.num_draft_tokens += batch_size * k - - return output_with_bonus_tokens - - def _raise_if_incorrect_shape( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - (target_batch_size, num_target_probs, - target_vocab_size) = target_probs.shape - bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape - draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape - draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape - - assert draft_batch_size == target_batch_size - assert num_draft_probs == num_target_probs - assert (draft_vocab_size == target_vocab_size - ), f"{draft_vocab_size=} {target_vocab_size=}" - - assert draft_token_ids_batch_size == draft_batch_size - assert num_draft_token_ids == num_draft_probs - - assert bonus_batch_size == target_batch_size - assert num_bonus_tokens == self._num_bonus_tokens - - def _raise_if_incorrect_dtype( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - assert all(probs.dtype == self.probs_dtype - for probs in [target_probs, draft_probs]) - assert all(token_ids.dtype == self.token_id_dtype - for token_ids in [bonus_token_ids, draft_token_ids]) - - def _raise_if_inconsistent_device( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - devices = [ - t.device for t in - [target_probs, bonus_token_ids, draft_probs, draft_token_ids] - ] - assert all([devices[0] == device for device in devices]) - - def _raise_if_out_of_bounds_vocab( - self, - vocab_size: int, - bonus_token_ids: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - assert torch.all(bonus_token_ids < vocab_size) - assert torch.all(bonus_token_ids >= 0) - assert torch.all(draft_token_ids < vocab_size) - assert torch.all(draft_token_ids >= 0) - # torch.multinomial forces a GPU<->CPU sync. # Therefore, we use an optimized implementation instead that skips the sync. diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py new file mode 100644 index 0000000000000..9856a7e7ddea0 --- /dev/null +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -0,0 +1,206 @@ +from typing import Optional + +import torch + + +class SpecDecodeBaseSampler(): + """Base class for samplers used for Speculative Decoding verification + step. + """ + + def __init__(self, + disable_bonus_tokens: bool = True, + strict_mode: bool = False): + """Base class constructor. + Args: + disable_bonus_tokens: Whether or not to disable the bonus token. + Require when bonus tokens will cause corrupt KV cache for + proposal methods that require KV cache. + strict_mode: Whether or not to perform shape/device/dtype checks + during sampling. This catches correctness issues but adds + nontrivial latency. + """ + super().__init__() + self._disable_bonus_tokens = disable_bonus_tokens + self._strict_mode = strict_mode + + # NOTE: A "bonus token" is accepted iff all proposal tokens are + # accepted. There is always only one possible bonus token. We store this + # value in a variable for readability. 
+ self._num_bonus_tokens = 1 + + self.num_accepted_tokens: Optional[torch.Tensor] = None + self.num_emitted_tokens: Optional[torch.Tensor] = None + self.num_draft_tokens: int = 0 + + def init_gpu_tensors(self, rank: int) -> None: + assert self.num_accepted_tokens is None + device = f"cuda:{rank}" + self.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + self.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + + @property + def probs_dtype(self): + return torch.float32 + + @property + def token_id_dtype(self): + return torch.int64 + + def _create_output( + self, + accepted: torch.Tensor, # [batch_size, k] + substitute_token_ids: torch.Tensor, # [batch_size, k] + draft_token_ids: torch.Tensor, # [batch_size, k] + bonus_token_ids: torch.Tensor, # [batch_size] + ) -> torch.Tensor: + """Format output. Returns a matrix of token ids. When + a token is rejected via sampling, all subsequent token ids are + set to -1 for the sequence. + + Args: + accepted: A boolean tensor indicating if the corresponding + draft token in draft_token_ids should be accepted or not. + substitute_token_ids: A tensor of token_ids that can be used + as substitutes for the draft token ids if the proposed token + is rejected. + draft_token_ids: A tensor of token ids speculated by the + draft model. + bonus_token_ids: Token ids to use as the bonus token if + all the draft tokens are accepted. + Returns: + A tensor containing the accepted token ids. The shape of the + tensor is [batch_size, k + num_bonus_tokens] + """ + batch_size, k = substitute_token_ids.shape + bonus_token_ids = bonus_token_ids.squeeze() + # Determine the index of the first False value for each row. + limits = (accepted == 0).max(1).indices + limits[~(accepted == 0).any(1)] = k + + # Create masks using the indices. + indices = torch.arange(k, device=accepted.device).unsqueeze(0) + accepted_mask = indices < limits.unsqueeze(1) + after_false_mask = indices == limits.unsqueeze(1) + + # Create an extended output tensor + output_with_bonus_tokens = -torch.ones( + (batch_size, k + self._num_bonus_tokens), + dtype=self.token_id_dtype, + device=accepted.device) + output = output_with_bonus_tokens[:, :k] + + # Fill in the first k columns of the output tensor using masks and data + # tensors. + output[:, :k] = torch.where(accepted_mask, draft_token_ids, + -torch.ones_like(draft_token_ids)) + + # Fill the last column. + # We check output directly as accepted may have True values inconsistent + # with causal acceptance. + output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, + bonus_token_ids, -1) + + # We disable bonus tokens because it causes corrupt KV cache for + # proposal methods that require KV cache. We can fix it by "prefilling" + # the bonus token in the proposer. The following issue tracks the fix. + # https://github.com/vllm-project/vllm/issues/4212 + if self._disable_bonus_tokens: + output_with_bonus_tokens[:, -1] = -1 + + # Fill the recovered token ids. 
+ output.mul_(~after_false_mask).add_( + substitute_token_ids.mul(after_false_mask)) + + self.num_accepted_tokens += accepted.sum() + self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() + self.num_draft_tokens += batch_size * k + + return output_with_bonus_tokens + + def _raise_if_incorrect_input( + self, + target_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: Optional[torch.Tensor] = None, + ) -> None: + self._raise_if_incorrect_shape(target_probs, draft_token_ids, + bonus_token_ids, draft_probs) + self._raise_if_incorrect_dtype(target_probs, draft_token_ids, + bonus_token_ids, draft_probs) + self._raise_if_inconsistent_device(target_probs, draft_token_ids, + bonus_token_ids, draft_probs) + self._raise_if_out_of_bounds_vocab(target_probs.shape[-1], + draft_token_ids, bonus_token_ids) + + def _raise_if_incorrect_shape( + self, + target_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: Optional[torch.Tensor] = None, + ) -> None: + (target_batch_size, num_target_probs, + target_vocab_size) = target_probs.shape + + # validate the shape of draft token ids. + draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape + assert draft_token_ids_batch_size == target_batch_size + assert num_draft_token_ids == num_target_probs + + # validate the shape of bonus token ids + bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape + assert bonus_batch_size == target_batch_size + assert num_bonus_tokens == self._num_bonus_tokens + + # validate the shape of draft probs if it is set + if draft_probs is not None: + (draft_batch_size, num_draft_probs, + draft_vocab_size) = draft_probs.shape + assert draft_batch_size == target_batch_size + assert num_draft_probs == num_target_probs + assert (draft_vocab_size == target_vocab_size + ), f"{draft_vocab_size=} {target_vocab_size=}" + + def _raise_if_incorrect_dtype( + self, + target_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: Optional[torch.Tensor] = None, + ) -> None: + assert target_probs.dtype == self.probs_dtype + assert draft_token_ids.dtype == self.token_id_dtype + assert bonus_token_ids.dtype == self.token_id_dtype + if draft_probs is not None: + assert draft_probs.dtype == self.probs_dtype + + def _raise_if_inconsistent_device( + self, + target_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: Optional[torch.Tensor] = None, + ) -> None: + devices = [ + t.device for t in + [target_probs, bonus_token_ids, draft_probs, draft_token_ids] + if t is not None + ] + assert all([devices[0] == device for device in devices]) + + def _raise_if_out_of_bounds_vocab( + self, + vocab_size: int, + draft_token_ids: torch.Tensor, + bonus_token_ids: torch.Tensor, + ) -> None: + assert torch.all(bonus_token_ids < vocab_size) + assert torch.all(bonus_token_ids >= 0) + assert torch.all(draft_token_ids < vocab_size) + assert torch.all(draft_token_ids >= 0) diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py new file mode 100644 index 0000000000000..f12d6a03b4d16 --- /dev/null +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -0,0 +1,186 @@ +import torch +import torch.jit +import torch.nn as nn + +from vllm.model_executor.layers.spec_decode_base_sampler import ( + SpecDecodeBaseSampler) + + +class TypicalAcceptanceSampler(SpecDecodeBaseSampler, 
nn.Module): + """Apply typical acceptance sampling as described in section 3.3.1 in + "MEDUSA: Simple LLM Inference Acceleration Framework with + Multiple Decoding Heads" + https://arxiv.org/pdf/2401.10774 + """ + + def __init__( + self, + disable_bonus_tokens: bool = False, + strict_mode: bool = False, + posterior_threshold: float = 0.09, + posterior_alpha: float = 0.3, + ): + """Create a Typical Acceptance Sampler. + + Args: + disable_bonus_tokens: Whether or not to disable the bonus token. + Require when bonus tokens will cause corrupt KV cache for + proposal methods that require KV cache. + strict_mode: Whether or not to perform shape/device/dtype checks + during sampling. This catches correctness issues but adds + nontrivial latency. + posterior_threshold : A threshold value that sets a lower bound + on the posterior probability of a token in target model for it + to be accepted. Default is 0.09 + posterior_alpha : A scaling factor for the entropy-based + threshold in typical acceptance sampling. Typically defaults to + sqrt of posterior_threshold and is set to 0.3. + """ + SpecDecodeBaseSampler.__init__( + self, + disable_bonus_tokens=disable_bonus_tokens, + strict_mode=strict_mode) + nn.Module.__init__(self) + self._posterior_threshold = posterior_threshold + self._posterior_alpha = posterior_alpha + + def forward( + self, + target_probs: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_token_ids: torch.Tensor, + ) -> torch.Tensor: + """Sample token ids using typical acceptance sampling. This accepts + or rejects tokens proposed by the draft model using the probability + of each token according to the draft and target models. + + In the worst case where all draft tokens are rejected, it is guaranteed + one token will be emitted. + + In the case where all draft tokens are accepted, the bonus token will be + accepted conditioned on self._disable_bonus_tokens being false. + + Args: + target_probs: The probability distribution over token ids given + context according to the target model. + shape = [batch_size, num_speculative_tokens, vocab_size] + + bonus_token_ids: The "bonus" token ids that are accepted iff all + speculative tokens in a sequence are accepted. + shape = [batch_size, num_bonus_tokens] + + draft_token_ids: The token ids that were sampled from the draft + probabilities. + shape = [batch_size, num_speculative_tokens] + + Returns: + output_token_ids: The token ids sampled via rejection sampling, + or -1 if unable to sample a token because the previous token + was rejected. + shape = [batch_size, num_speculative_tokens + num_bonus_tokens] + """ + # Only perform shape/dtype/device checking in strict mode, as it adds + # overhead. + if self._strict_mode: + self._raise_if_incorrect_input(target_probs, draft_token_ids, + bonus_token_ids) + accepted = self._evaluate_accepted_tokens(target_probs, + draft_token_ids) + recovered_token_ids = self._replacement_token_ids(target_probs) + output_token_ids = self._create_output(accepted, recovered_token_ids, + draft_token_ids, + bonus_token_ids) + return output_token_ids + + def _evaluate_accepted_tokens(self, target_probs, draft_token_ids): + r""" + Evaluates and returns a mask of accepted tokens based on the + posterior probabilities. + + Parameters: + ---------- + target_probs : torch.Tensor + A tensor of shape (batch_size, k, vocab_size) representing + the probabilities of each token in the vocabulary for each + position in the proposed sequence. This is the distribution + generated by the target model. 
+ draft_token_ids : torch.Tensor + A tensor of shape (batch_size, k) representing the proposed + token ids. + + A draft token_id x_{n+k} is accepted if it satisfies the + following condition + + .. math:: + p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > + \min \left( \epsilon, \delta * \exp \left( + -H(p_{\text{original}}( + \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) + + where :math:`p_{\text{original}}` corresponds to target_probs + and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters + specified using self._posterior_threshold and self._posterior_alpha + + This method computes the posterior probabilities for the given + draft token ids based on the provided target probabilities. It + calculates the entropy of the posterior distribution and determines + a dynamic threshold for each token position using the provided + posterior_threshold and posterior_alpha values. The method then + returns a boolean mask indicating which tokens can be accepted. + + Returns: + ------- + torch.Tensor + A boolean tensor of shape (batch_size, k) where each element + indicates whether the corresponding draft token has been accepted + or rejected. True indicates acceptance and false indicates + rejection. + + """ + device = target_probs.device + candidates_prob = torch.gather( + target_probs, dim=-1, + index=draft_token_ids.unsqueeze(-1)).squeeze(-1) + # A small constant added to prevent computing the logarithm of zero, + # which can lead to undefined values. + epsilon = 1e-5 + posterior_entropy = -torch.sum( + target_probs * torch.log(target_probs + epsilon), dim=-1) + threshold = torch.minimum( + torch.ones_like(posterior_entropy, device=device) * + self._posterior_threshold, + torch.exp(-posterior_entropy) * self._posterior_alpha, + ) + accepted_mask = candidates_prob > threshold + return accepted_mask + + def _replacement_token_ids(self, target_probs): + """ + Generate one replacement token ID for each sequence based on target + probabilities. The replacement token is used as the fallback option + if typical acceptance sampling does not accept any draft tokens for + that particular sequence. + + This method computes the token IDs to be replaced by selecting the + token with the highest probability for each sequence in the first + position. The rest of the output is filled with -1. + + Parameters + ---------- + target_probs : torch.Tensor + A tensor of shape (batch_size, k, vocab_size) containing + the target probability distribution + + Returns + ------- + torch.Tensor + A tensor of shape (batch_size, k) with the replacement + token IDs. Only the first column is set, and the rest of the + columns are filled with -1. 
+ """ + max_indices = torch.argmax(target_probs[:, 0, :], dim=1) + output = -torch.ones((target_probs.shape[0], target_probs.shape[1]), + dtype=self.token_id_dtype, + device=target_probs.device) + output[:, 0] = max_indices + return output From daef218b5595a8c744ee143223f4f0544619ea9f Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 18 Jun 2024 10:34:33 +0800 Subject: [PATCH 28/98] [Model] Initialize Phi-3-vision support (#4986) --- docs/source/models/supported_models.rst | 4 + examples/phi3v_example.py | 57 ++++ requirements-test.txt | 1 + tests/conftest.py | 3 + tests/models/test_phi3v.py | 124 ++++++++ vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/phi3v.py | 379 ++++++++++++++++++++++++ vllm/multimodal/utils.py | 2 + 8 files changed, 571 insertions(+) create mode 100644 examples/phi3v_example.py create mode 100644 tests/models/test_phi3v.py create mode 100644 vllm/model_executor/models/phi3v.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 5d3f55be1271f..f4673dc27092f 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -135,6 +135,10 @@ Alongside each architecture, we include some popular models that use it. - Phi-3-Small - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. - + * - :code:`Phi3VForCausalLM` + - Phi-3-Vision + - :code:`microsoft/Phi-3-vision-128k-instruct`, etc. + - * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py new file mode 100644 index 0000000000000..d5e60ae1ee3a2 --- /dev/null +++ b/examples/phi3v_example.py @@ -0,0 +1,57 @@ +import os +import subprocess + +from PIL import Image + +from vllm import LLM, SamplingParams +from vllm.multimodal.image import ImagePixelData + + +def run_phi3v(): + model_path = "microsoft/Phi-3-vision-128k-instruct" + llm = LLM( + model=model_path, + trust_remote_code=True, + max_model_len=4096, + image_input_type="pixel_values", + image_token_id=32044, + image_input_shape="1,3,1008,1344", + image_feature_size=1921, + disable_image_processor=False, + ) + + image = Image.open("images/cherry_blossom.jpg") + + # single-image prompt + prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501 + prompt = prompt.replace("<|image_1|>", "<|image|>" * 1921 + "") + + sampling_params = SamplingParams(temperature=0, max_tokens=64) + + outputs = llm.generate({ + "prompt": prompt, + "sampling_params": sampling_params, + "multi_modal_data": ImagePixelData(image), + }) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" + local_directory = "images" + + # Make sure the local directory exists or create it + os.makedirs(local_directory, exist_ok=True) + + # Use AWS CLI to sync the directory, assume anonymous access + subprocess.check_call([ + "aws", + "s3", + "sync", + s3_bucket_path, + local_directory, + "--no-sign-request", + ]) + run_phi3v() diff --git a/requirements-test.txt b/requirements-test.txt index 8b68e0e939669..fef0ede7be0ff 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -14,6 +14,7 @@ peft requests ray sentence-transformers # required for embedding +torchvision # required for the image processor of phi3v # Benchmarking aiohttp diff --git a/tests/conftest.py 
b/tests/conftest.py index 18aea3702d0a9..f37c9883f7cf1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -144,6 +144,7 @@ def __init__( model_name: str, dtype: str = "half", *, + model_kwargs: Optional[Dict[str, Any]] = None, is_embedding_model: bool = False, is_vision_model: bool = False, ) -> None: @@ -166,11 +167,13 @@ def __init__( else: auto_cls = AutoModelForCausalLM + model_kwargs = model_kwargs if model_kwargs is not None else {} self.model = self.wrap_device( auto_cls.from_pretrained( model_name, torch_dtype=torch_dtype, trust_remote_code=True, + **model_kwargs, )) self.tokenizer = AutoTokenizer.from_pretrained( diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py new file mode 100644 index 0000000000000..607ad95e8c365 --- /dev/null +++ b/tests/models/test_phi3v.py @@ -0,0 +1,124 @@ +from typing import List, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.config import VisionLanguageConfig +from vllm.utils import is_cpu + +from ..conftest import IMAGE_FILES + +pytestmark = pytest.mark.llava + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = [ + "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 + "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", +] + +assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + + +def iter_phi3v_configs(model_name: str): + image_hw_to_feature_size = { + (1008, 1344): 1921, + } + + for (h, w), f in image_hw_to_feature_size.items(): + for input_type, input_shape in [ + (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), + ]: + yield (model_name, + VisionLanguageConfig(image_input_type=input_type, + image_feature_size=f, + image_token_id=32044, + image_input_shape=input_shape, + image_processor=model_name, + image_processor_revision=None)) + + +model_and_vl_config = [ + *iter_phi3v_configs("microsoft/Phi-3-vision-128k-instruct"), +] + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str], + vlm_config: VisionLanguageConfig, model_id: str): + """Sanitize vllm output to be comparable with hf output. + The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, + x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... + It also reduces `output_str` from "bla" to "bla". + """ + input_ids, output_str = vllm_output + image_token_id = vlm_config.image_token_id + + tokenizer = AutoTokenizer.from_pretrained(model_id) + image_token_str = tokenizer.decode(image_token_id) + + hf_input_ids = [ + input_id if input_id != image_token_id else 0 + for idx, input_id in enumerate(input_ids) + ] + hf_output_str = output_str \ + .replace(image_token_str * vlm_config.image_feature_size, "") \ + .replace("", " ").replace("<|user|>", "") \ + .replace("<|end|>\n<|assistant|>", " ") + + return hf_input_ids, hf_output_str + + +target_dtype = "half" +if is_cpu(): + target_dtype = "bfloat16" + + +# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] +# Since we use _attn_implementation="eager" for hf_runner, here is +# numeric difference for longer context and test can't pass +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [8]) +def test_models(hf_runner, vllm_runner, hf_images, vllm_images, + model_and_config, dtype: str, max_tokens: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. 
+ For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + model_id, vlm_config = model_and_config + + # use eager mode for hf runner, since phi3_v didn't work with flash_attn + hf_model_kwargs = {"_attn_implementation": "eager"} + with hf_runner(model_id, dtype=dtype, + model_kwargs=hf_model_kwargs) as hf_model: + hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images) + + vllm_image_prompts = [ + p.replace("<|image_1|>", + "<|image|>" * vlm_config.image_feature_size + "") + for p in HF_IMAGE_PROMPTS + ] + + with vllm_runner(model_id, + max_model_len=2048, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) + + for i in range(len(HF_IMAGE_PROMPTS)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_to_hf_output( + vllm_outputs[i], vlm_config, model_id) + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index bed6f518ca033..f9ec7209689e7 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -49,6 +49,7 @@ "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), + "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py new file mode 100644 index 0000000000000..e8f190d3fc4fa --- /dev/null +++ b/vllm/model_executor/models/phi3v.py @@ -0,0 +1,379 @@ +# coding=utf-8 +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + +import torch +import torch.nn as nn +from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig +from transformers.utils import logging + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.llama import LlamaModel +from vllm.model_executor.models.vlm_base import VisionLanguageModelBase +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import get_dummy_image_data +from vllm.sequence import SamplerOutput + +logger = logging.get_logger(__name__) + +_KEYS_TO_MODIFY_MAPPING = { + "model.vision_embed_tokens": "vision_embed_tokens", +} + +CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0, + hidden_act="quick_gelu", + hidden_size=1024, + image_size=336, + intermediate_size=4096, + num_attention_heads=16, + num_channels=3, + num_hidden_layers=24, + patch_size=14, + projection_dim=768) + + +class Phi3ImageEmbeddingBase(nn.Module): + + def __init__(self, wte=None) -> None: + super().__init__() + self.wte = wte + self.layer_idx: int + self.type_feature: str + self.img_processor: CLIPVisionModel + + def set_img_features(self, img_features: torch.FloatTensor) -> None: + self.img_features = img_features + + def set_img_sizes(self, img_sizes: torch.LongTensor) -> None: + self.img_sizes = img_sizes + + def get_img_features(self, + img_embeds: torch.FloatTensor) -> torch.FloatTensor: + LAYER_IDX = self.layer_idx + TYPE_FEATURE = self.type_feature + + img_processor_output = self.img_processor(img_embeds, + output_hidden_states=True) + img_feature = img_processor_output.hidden_states[LAYER_IDX] + + if TYPE_FEATURE == "patch": + patch_feature = img_feature[:, 1:] + return patch_feature + + if TYPE_FEATURE == "cls_patch": + return img_feature + + raise NotImplementedError + + +# adapted from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_embedding_phi3_v.py +class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): + """Phi3 Image embedding with HD transform.""" + + def __init__(self, + vision_language_config: VisionLanguageConfig, + config: PretrainedConfig, + wte=None) -> None: + super().__init__(wte) + + self.image_token_id = vision_language_config.image_token_id + # n_embed or hidden_size + hidden_size = config.n_embd if hasattr( + config, 'n_embd') else config.hidden_size + + clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG + self.img_processor = CLIPVisionModel(clip_config) + image_dim_out = config.img_processor['image_dim_out'] + self.num_img_tokens = config.img_processor['num_img_tokens'] + + self.image_dim_out = image_dim_out + self.img_sizes = None + + # global_gn and sub_gn for hd transform, serves as line separator + self.use_hd_transform = config.embd_layer.get('use_hd_transform', + False) + self.with_learnable_separator = config.embd_layer.get( + 'with_learnable_separator', False) + self.hd_transform_order = config.embd_layer.get( + 'hd_transform_order', 'glb_sub') + # with_hd_transform and with_learnable_separator 
should have same value + assert self.use_hd_transform and self.with_learnable_separator + + # 1024 * 4, merge spatial to channel dimension + self.glb_GN = nn.Parameter(torch.empty([1, 1, self.image_dim_out * 4])) + self.sub_GN = nn.Parameter( + torch.empty([1, 1, 1, self.image_dim_out * 4])) + + dim_projection = hidden_size + depth = 2 + layers = [nn.Linear(image_dim_out * 4, dim_projection)] + for _ in range(1, depth): + layers.extend( + [nn.GELU(), + nn.Linear(dim_projection, dim_projection)]) + self.img_projection = nn.Sequential(*layers) + + self.vocab_size = config.vocab_size + self.img_features = None + + self.layer_idx = config.img_processor.get('layer_idx', -2) + self.type_feature = config.img_processor.get('type_feature', 'patch') + + def forward(self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + image_sizes=None) -> torch.FloatTensor: + """process and merge text embeddings with image embeddings.""" + + img_embeds = pixel_values + img_sizes = image_sizes + + if self.img_features is not None: + img_embeds = self.img_features.clone() + self.img_features = None + + if self.img_sizes is not None: + img_sizes = self.img_sizes + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + positions = torch.nonzero(input_ids == self.image_token_id) + + select = False + + target_device = self.img_projection[0].bias.device + target_dtype = self.img_projection[0].bias.dtype + + if len(positions.tolist()) > 0: + # if self.use_hd_transform and img_sizes: + # img_embeds: (num_images, max_num_crops, 3, H, W) + # img_sizes: (num_images, 2).view(1, -1) + + bs = img_embeds.shape[0] + # Nx(HW)xC + img_features = self.get_img_features(img_embeds.flatten(0, 1)) + base_feat_height = base_feat_width = int( + img_features.shape[1]**0.5) + + # bs x max_num_crops x (24x24) x C + img_features = img_features.view( + bs, -1, base_feat_height * base_feat_width, self.image_dim_out) + C = self.image_dim_out + H = base_feat_height + + output_imgs = [] + output_len = [] + + if isinstance(img_sizes, torch.Tensor): + img_sizes.squeeze_(0) + + for _bs in range(bs): + h, w = img_sizes + h = h // 336 + w = w // 336 + B_ = h * w + + # 1 x (24x24) x 1024 + global_img_feature = img_features[_bs, :1] + + # 1 x 12 x 12 x 4096 + glb_img = global_img_feature \ + .reshape(1, H // 2, 2, H // 2, 2,C) \ + .permute(0, 1, 3, 2, 4, 5) \ + .reshape(1, H // 2, H // 2, 4 * C) + temp_glb_GN = self.sub_GN.repeat(1, H // 2, 1, 1) + + # 1 x 156 x 4096 + glb_img = torch.cat([glb_img, temp_glb_GN], + dim=2).reshape(1, -1, 4 * C) + + # (max_num_crops-1) x (12x12) x C + sub_img = img_features[_bs, 1:] + # 16x574x1024 + # get rid of padding sub_img + sub_img = sub_img[:B_] + + sub_img = sub_img.reshape(B_, H // 2, 2, H // 2, 2, C) \ + .permute(0, 1, 3, 2, 4, 5).reshape(B_, -1, 4 * C) + sub_img = sub_img.reshape(1, h, w, 12, 12, -1) \ + .permute(0, 1, 3, 2, 4, 5) \ + .reshape(1, h * 12, w * 12, 4 * C) + temp_sub_GN = self.sub_GN.repeat(1, h * 12, 1, 1) + sub_img = torch.cat([sub_img, temp_sub_GN], + dim=2).reshape(1, -1, 4 * C) + # (1, num_img_tokens, 1024*4) + + # glb + sub + if self.hd_transform_order == 'glb_sub': + output_imgs.append( + torch.cat([glb_img, self.glb_GN, sub_img], dim=1)) + elif self.hd_transform_order == 'sub_glb': + output_imgs.append( + torch.cat([sub_img, self.glb_GN, glb_img], dim=1)) + + temp_len = int((h * w + 1) * 144 + 1 + (h + 1) * 12) + output_len.append(temp_len) + + num_img_tokens = output_len + img_set_tensor = [] + for _output_img in output_imgs: + 
img_feature_proj = self.img_projection( + _output_img.to(target_device, target_dtype)) + img_set_tensor.append(img_feature_proj) + select = True + + input_ids.clamp_min_(0).clamp_max_(self.vocab_size) + + hidden_states = self.wte(input_ids) + + if select: + idx = 0 + for i, cnt in enumerate(num_img_tokens): + hidden_states[positions[idx, 0], + positions[idx, 1]:positions[idx, 1] + + cnt] = (img_set_tensor[i].to( + hidden_states.device, hidden_states.dtype)) + idx += cnt + + return hidden_states.squeeze(0) + + +class Phi3VImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, 1 + num_patches, num_channels, height, width)""" + + image_sizes: torch.Tensor + """Shape: (batch_size, 2)""" + + +@MULTIMODAL_REGISTRY.register_image_pixel_input() +@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) +class Phi3VForCausalLM(VisionLanguageModelBase): + + def __init__(self, + config: PretrainedConfig, + vision_language_config: VisionLanguageConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__(vision_language_config) + self.config = config + self.model = LlamaModel(config, cache_config, quant_config) + self.vision_embed_tokens = Phi3HDImageEmbedding( + vision_language_config, config, self.model.embed_tokens) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Phi3VImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_sizes = kwargs.pop("image_sizes", None) + + expected_input_type = self.vision_language_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + if expected_input_type != ImageInputType.PIXEL_VALUES: + raise ValueError( + f"Unexpected image input type: {expected_input_type}." 
+ "Phi3v only support pixel_values input currently.") + + if pixel_values is not None and image_sizes is not None: + return Phi3VImagePixelInputs(type="pixel_values", + data=pixel_values, + image_sizes=image_sizes) + + return None + + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, **kwargs: object): + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + inputs_embeds = self.vision_embed_tokens( + input_ids, image_input["data"], image_input["image_sizes"]) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head.weight, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # We only do sharding for language model + # and not vision model for now. + if "vision_embed_tokens" in name and self.vision_embed_tokens: + continue + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c6311d60e0bdd..509f791d27c6f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -79,6 +79,8 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str, if config.hf_config.model_type in ("llava", "llava_next"): full_prompt = f"{image_prompt}\n{text_prompt}" + elif config.hf_config.model_type == 'phi3_v': + full_prompt = f"{image_prompt}\n{text_prompt}" else: raise ValueError( f"Unsupported model type: {config.hf_config.model_type}") From 5002175e801703c5b8a1411b490f6ff6c1747c8e Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Mon, 17 Jun 2024 21:54:11 -0600 Subject: [PATCH 29/98] [Kernel] Add punica dimensions for Granite 13b (#5559) Signed-off-by: Joe Runde --- csrc/punica/bgmv/bgmv_config.h | 8 ++++++++ tests/lora/test_punica.py | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 4b376261d30d2..0456b4bc263cc 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -20,6 +20,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 1152) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1536) \ + f(in_T, out_T, W_T, narrow, 1664) \ f(in_T, out_T, W_T, narrow, 1728) \ f(in_T, out_T, W_T, narrow, 1792) \ f(in_T, out_T, W_T, narrow, 2048) \ @@ -36,6 +37,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ + f(in_T, out_T, W_T, narrow, 5888) \ f(in_T, out_T, W_T, narrow, 6144) \ f(in_T, out_T, W_T, narrow, 6400) \ f(in_T, out_T, W_T, narrow, 6848) \ @@ -45,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 9216) \ f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ + f(in_T, out_T, W_T, narrow, 11264) \ f(in_T, out_T, W_T, narrow, 12288) \ f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ @@ -53,6 +56,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, narrow, 22016) \ + f(in_T, out_T, W_T, narrow, 22528) \ f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 27392) \ f(in_T, out_T, W_T, narrow, 27648) \ @@ -91,6 +95,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 1152, narrow) \ f(in_T, out_T, W_T, 1280, narrow) \ f(in_T, out_T, W_T, 1536, narrow) \ + f(in_T, out_T, W_T, 1664, narrow) \ f(in_T, out_T, W_T, 1728, narrow) \ f(in_T, out_T, W_T, 1792, narrow) \ f(in_T, out_T, W_T, 2048, narrow) \ @@ -107,6 +112,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 5120, narrow) \ f(in_T, out_T, W_T, 5504, narrow) \ f(in_T, out_T, W_T, 5632, narrow) \ + f(in_T, out_T, W_T, 5888, narrow) \ f(in_T, out_T, W_T, 6144, narrow) \ f(in_T, out_T, W_T, 6400, narrow) \ f(in_T, out_T, W_T, 6848, narrow) \ @@ -116,6 +122,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 9216, narrow) \ f(in_T, out_T, W_T, 10240, narrow) \ f(in_T, out_T, W_T, 11008, narrow) \ + f(in_T, 
out_T, W_T, 11264, narrow) \ f(in_T, out_T, W_T, 12288, narrow) \ f(in_T, out_T, W_T, 13696, narrow) \ f(in_T, out_T, W_T, 13824, narrow) \ @@ -124,6 +131,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 16384, narrow) \ f(in_T, out_T, W_T, 20480, narrow) \ f(in_T, out_T, W_T, 22016, narrow) \ + f(in_T, out_T, W_T, 22528, narrow) \ f(in_T, out_T, W_T, 24576, narrow) \ f(in_T, out_T, W_T, 27392, narrow) \ f(in_T, out_T, W_T, 27648, narrow) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index f021c003b1322..d87658e5dd886 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -53,6 +53,7 @@ def _lora_ref_impl( 1152, 1280, 1536, + 1664, 2048, 2304, 2560, @@ -66,6 +67,7 @@ def _lora_ref_impl( 5120, 5504, 5632, + 5888, 6144, 6400, 6848, @@ -75,10 +77,12 @@ def _lora_ref_impl( 9216, 10240, 11008, + 11264, 13824, 14336, 15360, 22016, + 22528, 24576, 27392, 27648, From 8eadcf0b90f126cf9b23f9583a53b19b6b58fd87 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 17 Jun 2024 20:54:57 -0700 Subject: [PATCH 30/98] [misc][typo] fix typo (#5620) --- vllm/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/block.py b/vllm/block.py index e7fb29c8c2c61..bd00c07adc0d7 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -13,7 +13,7 @@ class BlockPool: - """A pool of physical blocks. + """A pool of logical blocks. When requests come, we create a lot of logical blocks; when requests are done, we destroy a lot of logical blocks. It turns out that creating and destroying logical blocks can be expensive, From 32c86e494a49dff8d1d4b10c5922a36daa6e8faf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 18 Jun 2024 11:58:30 +0800 Subject: [PATCH 31/98] [Misc] Fix typo (#5618) --- .../scripts/convert-results-json-to-markdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 9aa8162d18d2d..e1002213fdbc0 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -21,7 +21,7 @@ "P99": "P99", } -# thoughput tests and the keys that will be printed into markdown +# throughput tests and the keys that will be printed into markdown throughput_results = [] throughput_results_column_mapping = { "test_name": "Test name", From 114d7270ffc2e5a66e0974b0d6d913c7f990afa7 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 17 Jun 2024 21:37:18 -0700 Subject: [PATCH 32/98] [CI] Avoid naming different metrics with the same name in performance benchmark (#5615) --- .../convert-results-json-to-markdown.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index e1002213fdbc0..534ecf17930e9 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -15,10 +15,10 @@ "avg_latency": "Mean latency (ms)", # "P10": "P10 (s)", # "P25": "P25 (s)", - "P50": "Median", + "P50": "Median latency (ms)", # "P75": "P75 (s)", # "P90": "P90 (s)", - "P99": "P99", + "P99": "P99 latency (ms)", } # throughput tests and the keys that will be printed into markdown @@ -43,15 +43,14 
@@ # "input_throughput": "Input Tput (tok/s)", # "output_throughput": "Output Tput (tok/s)", "mean_ttft_ms": "Mean TTFT (ms)", - # do not say TTFT again to avoid the table getting too wide - "median_ttft_ms": "Median", - "p99_ttft_ms": "P99", + "median_ttft_ms": "Median TTFT (ms)", + "p99_ttft_ms": "P99 TTFT (ms)", # "mean_tpot_ms": "Mean TPOT (ms)", # "median_tpot_ms": "Median", # "p99_tpot_ms": "P99", "mean_itl_ms": "Mean ITL (ms)", - "median_itl_ms": "Median", - "p99_itl_ms": "P99", + "median_itl_ms": "Median ITL (ms)", + "p99_itl_ms": "P99 ITL (ms)", } @@ -183,3 +182,11 @@ def results_to_json(latency, throughput, serving): serving_tests_markdown_table=serving_md_table, benchmarking_results_in_json_string=processed_results_json) f.write(results) + + # document benchmarking results in json + with open(results_folder / "benchmark_results.json", "w") as f: + + results = latency_results.to_dict( + orient='records') + throughput_results.to_dict( + orient='records') + serving_results.to_dict(orient='records') + f.write(json.dumps(results)) From db5ec52ad7dc69dbe8dd9ba25fe8f2c6ce35a4cf Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 18 Jun 2024 00:21:05 -0700 Subject: [PATCH 33/98] [bugfix][distributed] improve p2p capability test (#5612) [bugfix][distributed] do not error if two processes do not agree on p2p capability (#5612) --- .../device_communicators/custom_all_reduce_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 75b7c374c8e6e..e0641a54c4194 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -71,6 +71,7 @@ def consumer(batch_tgt: Sequence[int], if open_success: # modify the memory lib.cudaMemset(pointer, 2, 1024) + lib.cudaDeviceSynchronize() # use two queues to simulate barrier producer_queue.get() consumer_queue.put(0) @@ -142,8 +143,13 @@ def can_actually_p2p( for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() - assert a == b - result.append(a) + if a != b: + logger.warning( + "Two processes do not agree on the P2P access" + " status on %d -> %d, treat as disabled.", src, tgt) + result.append(False) + else: + result.append(a) return result From f0cc0e68e3ceef6fe43f78bf36df88e6cad28766 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 18 Jun 2024 05:12:19 -0700 Subject: [PATCH 34/98] [Misc] Remove import from transformers logging (#5625) --- vllm/model_executor/models/phi3v.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index e8f190d3fc4fa..35f3b894f099a 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -18,7 +18,6 @@ import torch import torch.nn as nn from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig -from transformers.utils import logging from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -35,8 +34,6 @@ from vllm.multimodal.image import get_dummy_image_data from vllm.sequence import SamplerOutput -logger = logging.get_logger(__name__) - _KEYS_TO_MODIFY_MAPPING = { "model.vision_embed_tokens": "vision_embed_tokens", } From 4ad7b53e59b6600d050581329dfaba0222b13ae5 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 18 
Jun 2024 06:10:04 -0700 Subject: [PATCH 35/98] [CI/Build][Misc] Update Pytest Marker for VLMs (#5623) --- .buildkite/run-cpu-test.sh | 2 +- .buildkite/test-pipeline.yaml | 6 +++--- pyproject.toml | 2 +- tests/models/test_llava.py | 2 +- tests/models/test_llava_next.py | 2 +- tests/models/test_phi3v.py | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 532d6ad8811de..f4fa24be1f20f 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,4 +23,4 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf cd ../ - pytest -v -s tests/models -m \"not llava\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6439a315e327a..c1e433ec4f5c3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -100,13 +100,13 @@ steps: - label: Models Test #mirror_hardwares: [amd] commands: - - pytest -v -s models -m \"not llava\" + - pytest -v -s models -m \"not vlm\" -- label: Llava Test +- label: Vision Language Models Test mirror_hardwares: [amd] commands: - bash ../.buildkite/download-images.sh - - pytest -v -s models -m llava + - pytest -v -s models -m vlm - label: Prefix Caching Test mirror_hardwares: [amd] diff --git a/pyproject.toml b/pyproject.toml index eb691c29724ce..4958aae02594a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,5 +71,5 @@ markers = [ "skip_global_cleanup", "llm: run tests for vLLM API only", "openai: run tests for OpenAI API only", - "llava: run tests for LLaVA models only", + "vlm: run tests for vision language models only", ] diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index a1f0cff1cc0e5..b41c69f72b052 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -7,7 +7,7 @@ from ..conftest import IMAGE_FILES -pytestmark = pytest.mark.llava +pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass HF_IMAGE_PROMPTS = [ diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index aa6ee268ae588..0eca5cb5330c8 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -7,7 +7,7 @@ from ..conftest import IMAGE_FILES -pytestmark = pytest.mark.llava +pytestmark = pytest.mark.vlm _PREFACE = ( "A chat between a curious human and an artificial intelligence assistant. " diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 607ad95e8c365..1732e8f08a897 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -8,7 +8,7 @@ from ..conftest import IMAGE_FILES -pytestmark = pytest.mark.llava +pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass HF_IMAGE_PROMPTS = [ From 13db4369d9ab3158a01192d60c744c6523961824 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Tue, 18 Jun 2024 07:26:20 -0700 Subject: [PATCH 36/98] [ci] Deprecate original CI template (#5624) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 2 +- .buildkite/test-template.j2 | 101 ---------------------------------- 2 files changed, 1 insertion(+), 102 deletions(-) delete mode 100644 .buildkite/test-template.j2 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c1e433ec4f5c3..a81885b8a9184 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,6 +1,6 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template.j2` to generate +# This script will be feed into Jinja template in `test-template-aws.j2` to generate # the final pipeline yaml file. steps: diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 deleted file mode 100644 index 3bd1e90c2b711..0000000000000 --- a/.buildkite/test-template.j2 +++ /dev/null @@ -1,101 +0,0 @@ -{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %} -{% set default_num_gpu = 1 %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." - - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel Test" - depends_on: ~ - agents: - queue: intel - command: bash .buildkite/run-cpu-test.sh - - - label: "XPU Test" - agents: - queue: intel - command: bash .buildkite/run-xpu-test.sh - - {% for step in steps %} - - label: "{{ step.label }}" - agents: - queue: kubernetes - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - {% if step.num_gpus %} - priorityClassName: gpu-priority-cls-{{ step.num_gpus }} - {% endif %} - volumes: - - name: dshm - emptyDir: - medium: Memory - containers: - - image: "{{ docker_image }}" - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - {% if not step.no_gpu %} - resources: - requests: - nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" - limits: - nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" - {% endif %} - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - volumeMounts: - - mountPath: /dev/shm - name: 
dshm - {% endfor %} From 7879f24dcce75665d83865ee8281f2ef1bbb7e74 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Tue, 18 Jun 2024 19:17:03 +0300 Subject: [PATCH 37/98] [Misc] Add OpenTelemetry support (#4687) This PR adds basic support for OpenTelemetry distributed tracing. It includes changes to enable tracing functionality and improve monitoring capabilities. I've also added a markdown with print-screens to guide users how to use this feature. You can find it here --- .buildkite/test-pipeline.yaml | 9 ++ benchmarks/benchmark_latency.py | 48 +++++--- examples/production_monitoring/Otel.md | 82 +++++++++++++ .../production_monitoring/dummy_client.py | 35 ++++++ tests/tracing/__init__.py | 0 tests/tracing/test_tracing.py | 116 ++++++++++++++++++ vllm/config.py | 13 ++ vllm/engine/arg_utils.py | 40 ++++-- vllm/engine/async_llm_engine.py | 22 ++++ vllm/engine/llm_engine.py | 102 +++++++++++++-- vllm/entrypoints/openai/serving_chat.py | 11 ++ vllm/entrypoints/openai/serving_completion.py | 11 ++ vllm/sequence.py | 3 + vllm/tracing.py | 104 ++++++++++++++++ vllm/utils.py | 12 ++ 15 files changed, 567 insertions(+), 41 deletions(-) create mode 100644 examples/production_monitoring/Otel.md create mode 100644 examples/production_monitoring/dummy_client.py create mode 100644 tests/tracing/__init__.py create mode 100644 tests/tracing/test_tracing.py create mode 100644 vllm/tracing.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a81885b8a9184..5afe3730210e8 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -159,6 +159,15 @@ steps: #mirror_hardwares: [amd] command: pytest -v -s quantization +- label: Tracing Test + commands: + - "pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai" + - pytest -v -s tracing + - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" mirror_hardwares: [amd] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 767afd21aeacf..98e0be2779922 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -20,26 +20,29 @@ def main(args: argparse.Namespace): # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
- llm = LLM(model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend) + llm = LLM( + model=args.model, + speculative_model=args.speculative_model, + num_speculative_tokens=args.num_speculative_tokens, + tokenizer=args.tokenizer, + quantization=args.quantization, + tensor_parallel_size=args.tensor_parallel_size, + trust_remote_code=args.trust_remote_code, + dtype=args.dtype, + enforce_eager=args.enforce_eager, + kv_cache_dtype=args.kv_cache_dtype, + quantization_param_path=args.quantization_param_path, + device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, + use_v2_block_manager=args.use_v2_block_manager, + enable_chunked_prefill=args.enable_chunked_prefill, + download_dir=args.download_dir, + block_size=args.block_size, + gpu_memory_utilization=args.gpu_memory_utilization, + load_format=args.load_format, + distributed_executor_backend=args.distributed_executor_backend, + otlp_traces_endpoint=args.otlp_traces_endpoint, + ) sampling_params = SamplingParams( n=args.n, @@ -254,5 +257,10 @@ def run_to_completion(profile_dir: Optional[str] = None): help='Backend to use for distributed serving. When more than 1 GPU ' 'is used, will be automatically set to "ray" if installed ' 'or "mp" (multiprocessing) otherwise.') + parser.add_argument( + '--otlp-traces-endpoint', + type=str, + default=None, + help='Target URL to which OpenTelemetry traces will be sent.') args = parser.parse_args() main(args) diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md new file mode 100644 index 0000000000000..1449442273c7a --- /dev/null +++ b/examples/production_monitoring/Otel.md @@ -0,0 +1,82 @@ +# Setup OpenTelemetry POC + +1. Install OpenTelemetry packages: + ``` + pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai + ``` + +1. Start Jaeger in a docker container: + ``` + # From: https://www.jaegertracing.io/docs/1.57/getting-started/ + docker run --rm --name jaeger \ + -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \ + -p 6831:6831/udp \ + -p 6832:6832/udp \ + -p 5778:5778 \ + -p 16686:16686 \ + -p 4317:4317 \ + -p 4318:4318 \ + -p 14250:14250 \ + -p 14268:14268 \ + -p 14269:14269 \ + -p 9411:9411 \ + jaegertracing/all-in-one:1.57 + ``` + +1. 
In a new shell, export Jaeger IP: + ``` + export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 + ``` + Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM: + ``` + export OTEL_SERVICE_NAME="vllm-server" + export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true + python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" + ``` + +1. In a new shell, send requests with trace context from a dummy client + ``` + export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 + export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true + export OTEL_SERVICE_NAME="client-service" + python dummy_client.py + ``` + +1. Open Jaeger webui: http://localhost:16686/ + + In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request. + ![Traces](https://i.imgur.com/GYHhFjo.png) + +1. Clicking on a trace will show its spans and their tags. In this demo, each trace has 2 spans. One from the dummy client containing the prompt text and one from vLLM containing metadata about the request. +![Spans details](https://i.imgur.com/OPf6CBL.png) + +## Exporter Protocol +OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter. +By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows: +``` +export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf +export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces +python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" +``` + +## Instrumentation of FastAPI +OpenTelemetry allows automatic instrumentation of FastAPI. +1. Install the instrumentation library + ``` + pip install opentelemetry-instrumentation-fastapi + ``` + +1. Run vLLM with `opentelemetry-instrument` + ``` + opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" + ``` + +1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI. 
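   As a minimal sketch of that request, assuming the server started with `opentelemetry-instrument` above is still serving `facebook/opt-125m` on the default port 8000, a plain completion call is enough; the FastAPI instrumentation creates the server spans on its own, so no trace headers are required here.
   ```python
   import requests

   # Plain, untraced request; the auto-instrumented server still records
   # FastAPI spans for it, which then appear in Jaeger.
   response = requests.post(
       "http://localhost:8000/v1/completions",
       json={
           "model": "facebook/opt-125m",
           "prompt": "San Francisco is a",
           "max_tokens": 10,
       },
   )
   print(response.json())
   ```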
+ +![FastAPI Spans](https://i.imgur.com/hywvoOJ.png) \ No newline at end of file diff --git a/examples/production_monitoring/dummy_client.py b/examples/production_monitoring/dummy_client.py new file mode 100644 index 0000000000000..b1a2b3c3c4aaf --- /dev/null +++ b/examples/production_monitoring/dummy_client.py @@ -0,0 +1,35 @@ +import requests +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( + OTLPSpanExporter) +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import (BatchSpanProcessor, + ConsoleSpanExporter) +from opentelemetry.trace import SpanKind, set_tracer_provider +from opentelemetry.trace.propagation.tracecontext import ( + TraceContextTextMapPropagator) + +trace_provider = TracerProvider() +set_tracer_provider(trace_provider) + +trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) +trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) + +tracer = trace_provider.get_tracer("dummy-client") + +url = "http://localhost:8000/v1/completions" +with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span: + prompt = "San Francisco is a" + span.set_attribute("prompt", prompt) + headers = {} + TraceContextTextMapPropagator().inject(headers) + payload = { + "model": "facebook/opt-125m", + "prompt": prompt, + "max_tokens": 10, + "best_of": 20, + "n": 3, + "use_beam_search": "true", + "temperature": 0.0, + # "stream": True, + } + response = requests.post(url, headers=headers, json=payload) diff --git a/tests/tracing/__init__.py b/tests/tracing/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py new file mode 100644 index 0000000000000..2f8f62cf2d1e4 --- /dev/null +++ b/tests/tracing/test_tracing.py @@ -0,0 +1,116 @@ +import os +import threading +from concurrent import futures +from typing import Callable, Dict, Iterable, Literal + +import grpc +import pytest +from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ( + ExportTraceServiceResponse) +from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import ( + TraceServiceServicer, add_TraceServiceServicer_to_server) +from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue +from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_INSECURE) + +from vllm import LLM, SamplingParams +from vllm.tracing import SpanAttributes + +FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" + +FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', + 'array_value'] + + +def decode_value(value: AnyValue): + field_decoders: Dict[FieldName, Callable] = { + "bool_value": (lambda v: v.bool_value), + "string_value": (lambda v: v.string_value), + "int_value": (lambda v: v.int_value), + "double_value": (lambda v: v.double_value), + "array_value": + (lambda v: [decode_value(item) for item in v.array_value.values]), + } + for field, decoder in field_decoders.items(): + if value.HasField(field): + return decoder(value) + raise ValueError(f"Couldn't decode value: {value}") + + +def decode_attributes(attributes: Iterable[KeyValue]): + return {kv.key: decode_value(kv.value) for kv in attributes} + + +class FakeTraceService(TraceServiceServicer): + + def __init__(self): + self.request = None + self.evt = threading.Event() + + def Export(self, request, context): + self.request = request + self.evt.set() + return ExportTraceServiceResponse() + + +@pytest.fixture +def trace_service(): + 
"""Fixture to set up a fake gRPC trace service""" + server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + service = FakeTraceService() + add_TraceServiceServicer_to_server(service, server) + server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS) + server.start() + + yield service + + server.stop(None) + + +def test_traces(trace_service): + os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" + + sampling_params = SamplingParams(temperature=0.01, + top_p=0.1, + max_tokens=256) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) + + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") + + attributes = decode_attributes(trace_service.request.resource_spans[0]. + scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + assert attributes.get( + SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + assert attributes.get( + SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get( + SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of + assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time diff --git a/vllm/config.py b/vllm/config.py index d95faf52db1a0..5de00d7d38d46 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -10,6 +10,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry +from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, is_hip, is_neuron, is_tpu, is_xpu) @@ -1371,6 +1372,17 @@ def __post_init__(self): f"must be one of {valid_guided_backends}") +@dataclass +class ObservabilityConfig: + """Configuration for observability.""" + otlp_traces_endpoint: Optional[str] = None + + def __post_init__(self): + if not is_otel_installed() and self.otlp_traces_endpoint is not None: + raise ValueError("OpenTelemetry packages must be installed before " + "configuring 'otlp_traces_endpoint'") + + @dataclass(frozen=True) class EngineConfig: """Dataclass which contains all engine-related configuration. 
This @@ -1387,6 +1399,7 @@ class EngineConfig: vision_language_config: Optional[VisionLanguageConfig] speculative_config: Optional[SpeculativeConfig] decoding_config: Optional[DecodingConfig] + observability_config: Optional[ObservabilityConfig] def __post_init__(self): """Verify configs are valid & consistent with each other. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9d04f1dc557fd..647793a6d5cd1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -7,8 +7,9 @@ from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - TokenizerPoolConfig, VisionLanguageConfig) + ObservabilityConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, TokenizerPoolConfig, + VisionLanguageConfig) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import str_to_int_tuple @@ -101,6 +102,8 @@ class EngineArgs: qlora_adapter_name_or_path: Optional[str] = None + otlp_traces_endpoint: Optional[str] = None + def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model @@ -599,6 +602,13 @@ def add_cli_args( type=str, default=None, help='Name or path of the QLoRA adapter.') + + parser.add_argument( + '--otlp-traces-endpoint', + type=str, + default=None, + help='Target URL to which OpenTelemetry traces will be sent.') + return parser @classmethod @@ -757,6 +767,9 @@ def create_engine_config(self, ) -> EngineConfig: decoding_config = DecodingConfig( guided_decoding_backend=self.guided_decoding_backend) + observability_config = ObservabilityConfig( + otlp_traces_endpoint=self.otlp_traces_endpoint) + if (model_config.get_sliding_window() is not None and scheduler_config.chunked_prefill_enabled and not scheduler_config.use_v2_block_manager): @@ -764,16 +777,19 @@ def create_engine_config(self, ) -> EngineConfig: "Chunked prefill is not supported with sliding window. " "Set --disable-sliding-window to disable sliding window.") - return EngineConfig(model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - lora_config=lora_config, - vision_language_config=vision_language_config, - speculative_config=speculative_config, - load_config=load_config, - decoding_config=decoding_config) + return EngineConfig( + model_config=model_config, + cache_config=cache_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + lora_config=lora_config, + vision_language_config=vision_language_config, + speculative_config=speculative_config, + load_config=load_config, + decoding_config=decoding_config, + observability_config=observability_config, + ) @dataclass diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index ab312850b9ec2..86720e4fbe7c6 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -244,6 +244,9 @@ async def step_async( # Log stats. self.do_log_stats(scheduler_outputs, output) + # Tracing + self.do_tracing(scheduler_outputs) + if not request_outputs: # Stop the execute model loop in parallel workers until there are # more requests to process. 
This avoids waiting indefinitely in @@ -285,6 +288,7 @@ async def add_request_async( params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -301,6 +305,7 @@ async def add_request_async( params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) async def check_health_async(self) -> None: @@ -556,6 +561,7 @@ async def add_request( params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncStream: if self.log_requests: if isinstance(inputs, str): @@ -597,6 +603,7 @@ async def add_request( params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) return stream @@ -607,6 +614,7 @@ async def generate( sampling_params: SamplingParams, request_id: str, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. @@ -621,6 +629,7 @@ async def generate( sampling_params: The sampling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. + trace_headers: OpenTelemetry trace headers. Yields: The output `RequestOutput` objects from the LLMEngine @@ -674,6 +683,7 @@ async def generate( inputs, sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ): yield LLMEngine.validate_output(output, RequestOutput) @@ -683,6 +693,7 @@ async def encode( pooling_params: PoolingParams, request_id: str, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[EmbeddingRequestOutput]: """Generate outputs for a request from an embedding model. @@ -697,6 +708,7 @@ async def encode( pooling_params: The pooling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. + trace_headers: OpenTelemetry trace headers. 
Yields: The output `EmbeddingRequestOutput` objects from the LLMEngine @@ -748,6 +760,7 @@ async def encode( inputs, pooling_params, lora_request=lora_request, + trace_headers=trace_headers, ): yield LLMEngine.validate_output(output, EmbeddingRequestOutput) @@ -758,6 +771,7 @@ async def _process_request( params: Union[SamplingParams, PoolingParams], *, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]: """Common logic to process requests with SamplingParams or PoolingParams.""" @@ -769,6 +783,7 @@ async def _process_request( params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) try: @@ -848,3 +863,10 @@ async def check_health(self) -> None: else: await self.engine.check_health_async() logger.debug("Health check took %fs", time.perf_counter() - t) + + async def is_tracing_enabled(self) -> bool: + if self.engine_use_ray: + return await self.engine.is_tracing_enabled.remote( # type: ignore + ) + else: + return self.engine.is_tracing_enabled() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index eed9a17e477f3..75d417f525e3a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,14 +1,14 @@ import time from contextlib import contextmanager -from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional +from typing import TYPE_CHECKING, ClassVar, Dict, Iterable, List, Optional from typing import Sequence as GenericSequence from typing import Set, Type, TypeVar, Union from transformers import GenerationConfig, PreTrainedTokenizer from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig, - LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, SpeculativeConfig, + LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, SchedulerOutputs) @@ -31,6 +31,8 @@ PoolerOutput, SamplerOutput, Sequence, SequenceGroup, SequenceGroupMetadata, SequenceStatus) +from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, + init_tracer) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -154,6 +156,7 @@ def __init__( vision_language_config: Optional[VisionLanguageConfig], speculative_config: Optional[SpeculativeConfig], decoding_config: Optional[DecodingConfig], + observability_config: Optional[ObservabilityConfig], executor_class: Type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, @@ -168,7 +171,8 @@ def __init__( "disable_custom_all_reduce=%s, quantization=%s, " "enforce_eager=%s, kv_cache_dtype=%s, " "quantization_param_path=%s, device_config=%s, " - "decoding_config=%r, seed=%d, served_model_name=%s)", + "decoding_config=%r, observability_config=%r, " + "seed=%d, served_model_name=%s)", VLLM_VERSION, model_config.model, speculative_config, @@ -192,6 +196,7 @@ def __init__( model_config.quantization_param_path, device_config.device, decoding_config, + observability_config, model_config.seed, model_config.served_model_name, ) @@ -207,6 +212,8 @@ def __init__( self.speculative_config = speculative_config self.load_config = load_config self.decoding_config = decoding_config or DecodingConfig() + self.observability_config = observability_config or ObservabilityConfig( + ) self.log_stats = log_stats 
if not self.model_config.skip_tokenizer_init: @@ -288,6 +295,12 @@ def __init__( max_model_len=self.model_config.max_model_len) self.stat_logger.info("cache_config", self.cache_config) + self.tracer = None + if self.observability_config.otlp_traces_endpoint: + self.tracer = init_tracer( + "vllm.llm_engine", + self.observability_config.otlp_traces_endpoint) + # Create sequence output processor, e.g. for beam search or # speculative decoding. self.output_processor = ( @@ -444,6 +457,7 @@ def _add_processed_request( params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], + trace_headers: Optional[Dict[str, str]] = None, ) -> None: # Create the sequences. block_size = self.cache_config.block_size @@ -461,6 +475,7 @@ def _add_processed_request( params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) elif isinstance(params, PoolingParams): seq_group = self._create_sequence_group_with_pooling( @@ -507,6 +522,7 @@ def add_request( params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: """Add a request to the engine's request pool. @@ -524,6 +540,7 @@ def add_request( :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. + trace_headers: OpenTelemetry trace headers. Details: - Set arrival_time to the current time if it is None. @@ -565,6 +582,7 @@ def add_request( params=params, arrival_time=arrival_time, lora_request=lora_request, + trace_headers=trace_headers, ) def _create_sequence_group_with_sampling( @@ -574,6 +592,7 @@ def _create_sequence_group_with_sampling( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest], + trace_headers: Optional[Dict[str, str]] = None, ) -> SequenceGroup: """Creates a SequenceGroup with SamplingParams.""" max_logprobs = self.get_model_config().max_logprobs @@ -595,11 +614,14 @@ def _create_sequence_group_with_sampling( self.generation_config_fields) # Create the sequence group. - seq_group = SequenceGroup(request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - sampling_params=sampling_params, - lora_request=lora_request) + seq_group = SequenceGroup( + request_id=request_id, + seqs=[seq], + arrival_time=arrival_time, + sampling_params=sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + ) return seq_group @@ -793,6 +815,9 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # Log stats. self.do_log_stats(scheduler_outputs, output) + # Tracing + self.do_tracing(scheduler_outputs) + if not request_outputs: # Stop the execute model loop in parallel workers until there are # more requests to process. 
This avoids waiting indefinitely in @@ -986,3 +1011,62 @@ def list_loras(self) -> Set[int]: def check_health(self) -> None: self.model_executor.check_health() + + def is_tracing_enabled(self) -> bool: + return self.tracer is not None + + def do_tracing(self, scheduler_outputs: SchedulerOutputs) -> None: + if self.tracer is None: + return + + for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: + seq_group = scheduled_seq_group.seq_group + if seq_group.is_finished(): + self.create_trace_span(seq_group) + + def create_trace_span(self, seq_group: SequenceGroup) -> None: + if self.tracer is None or seq_group.sampling_params is None: + return + arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9) + + trace_context = extract_trace_context(seq_group.trace_headers) + + with self.tracer.start_as_current_span( + "llm_request", + kind=SpanKind.SERVER, + context=trace_context, + start_time=arrival_time_nano_seconds) as seq_span: + metrics = seq_group.metrics + ttft = metrics.first_token_time - metrics.arrival_time + e2e_time = metrics.finished_time - metrics.arrival_time + # attribute names are based on + # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md + seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, + self.model_config.model) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID, + seq_group.request_id) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE, + seq_group.sampling_params.temperature) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P, + seq_group.sampling_params.top_p) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS, + seq_group.sampling_params.max_tokens) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_BEST_OF, + seq_group.sampling_params.best_of) + seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N, + seq_group.sampling_params.n) + seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES, + seq_group.num_seqs()) + seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS, + len(seq_group.prompt_token_ids)) + seq_span.set_attribute( + SpanAttributes.LLM_USAGE_COMPLETION_TOKENS, + sum([ + seq.get_output_len() + for seq in seq_group.get_finished_seqs() + ])) + seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE, + metrics.time_in_queue) + seq_span.set_attribute( + SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft) + seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 76940612496a0..744e1d94511b3 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -31,6 +31,8 @@ get_full_image_text_prompt) from vllm.outputs import RequestOutput from vllm.sequence import Logprob +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) from vllm.utils import random_uuid logger = init_logger(__name__) @@ -267,11 +269,20 @@ async def create_chat_completion( if image_data is not None: inputs["multi_modal_data"] = image_data + is_tracing_enabled = await self.engine.is_tracing_enabled() + trace_headers = None + if is_tracing_enabled and raw_request: + trace_headers = extract_trace_headers(raw_request.headers) + if not is_tracing_enabled and raw_request and contains_trace_headers( + raw_request.headers): + log_tracing_disabled_warning() + result_generator = self.engine.generate( inputs, sampling_params, request_id, lora_request, + trace_headers=trace_headers, 
) # Streaming response if request.stream: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 64671e21a724d..c775fa6daa739 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -24,6 +24,8 @@ get_guided_decoding_logits_processor) from vllm.outputs import RequestOutput from vllm.sequence import Logprob +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -125,6 +127,14 @@ async def create_completion(self, request: CompletionRequest, truncate_prompt_tokens) prompt_ids, prompt_text = prompt_formats + is_tracing_enabled = await self.engine.is_tracing_enabled() + trace_headers = None + if is_tracing_enabled: + trace_headers = extract_trace_headers(raw_request.headers) + if not is_tracing_enabled and contains_trace_headers( + raw_request.headers): + log_tracing_disabled_warning() + generator = self.engine.generate( { "prompt": prompt_text, @@ -133,6 +143,7 @@ async def create_completion(self, request: CompletionRequest, sampling_params, f"{request_id}-{i}", lora_request=lora_request, + trace_headers=trace_headers, ) generators.append(generator) diff --git a/vllm/sequence.py b/vllm/sequence.py index 54243bfb1e913..38d3349f2ab41 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -414,6 +414,7 @@ class SequenceGroup: for an embedding model. encoder_seq: Optional, the single encoder sequence. Should be None unless you are working with an encoder/decoder model. + trace_headers: OpenTelemetry trace headers. """ def __init__( @@ -426,6 +427,7 @@ def __init__( embeddings: Optional[List[float]] = None, pooling_params: Optional[PoolingParams] = None, encoder_seq: Optional[Sequence] = None, + trace_headers: Optional[Dict[str, str]] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -441,6 +443,7 @@ def __init__( self.embeddings = embeddings self.pooling_params = pooling_params self.encoder_seq = encoder_seq + self.trace_headers = trace_headers @property def prompt(self) -> Optional[str]: diff --git a/vllm/tracing.py b/vllm/tracing.py new file mode 100644 index 0000000000000..ba6732cab68f2 --- /dev/null +++ b/vllm/tracing.py @@ -0,0 +1,104 @@ +import os +from typing import Mapping, Optional + +from vllm.logger import init_logger +from vllm.utils import run_once + +TRACE_HEADERS = ["traceparent", "tracestate"] + +logger = init_logger(__name__) + +_is_otel_installed = False +try: + from opentelemetry.context.context import Context + from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_PROTOCOL) + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.semconv.ai import SpanAttributes as BaseSpanAttributes + from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider + from opentelemetry.trace.propagation.tracecontext import ( + TraceContextTextMapPropagator) + _is_otel_installed = True +except ImportError: + + class Context: # type: ignore + pass + + class BaseSpanAttributes: # type: ignore + pass + + class SpanKind: # type: ignore + pass + + class Tracer: # type: ignore + pass + + +def is_otel_installed() -> bool: + return _is_otel_installed + + +def init_tracer(instrumenting_module_name: str, + otlp_traces_endpoint: str) -> Optional[Tracer]: + assert is_otel_installed(), 
("OpenTelemetry packages must be installed " + "prior to initializing a tracer") + trace_provider = TracerProvider() + + span_exporter = get_span_exporter(otlp_traces_endpoint) + trace_provider.add_span_processor(BatchSpanProcessor(span_exporter)) + set_tracer_provider(trace_provider) + + tracer = trace_provider.get_tracer(instrumenting_module_name) + return tracer + + +def get_span_exporter(endpoint): + protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc") + if protocol == "grpc": + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( + OTLPSpanExporter) + elif protocol == "http/protobuf": + from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( + OTLPSpanExporter) + else: + raise ValueError( + f"Unsupported OTLP protocol '{protocol}' is configured") + + return OTLPSpanExporter(endpoint=endpoint) + + +def extract_trace_context( + headers: Optional[Mapping[str, str]]) -> Optional[Context]: + if is_otel_installed(): + headers = headers or {} + return TraceContextTextMapPropagator().extract(headers) + else: + return None + + +def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]: + + return {h: headers[h] for h in TRACE_HEADERS if h in headers} + + +class SpanAttributes(BaseSpanAttributes): + # The following span attribute names are added here because they are missing + # from the Semantic Conventions for LLM. + LLM_REQUEST_ID = "gen_ai.request.id" + LLM_REQUEST_BEST_OF = "gen_ai.request.best_of" + LLM_REQUEST_N = "gen_ai.request.n" + LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" + LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" + LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" + LLM_LATENCY_E2E = "gen_ai.latency.e2e" + + +def contains_trace_headers(headers: Mapping[str, str]) -> bool: + return any(h in headers for h in TRACE_HEADERS) + + +@run_once +def log_tracing_disabled_warning() -> None: + logger.warning( + "Received a request with trace context but tracing is disabled") diff --git a/vllm/utils.py b/vllm/utils.py index 1adfa9218c047..ffe921e650aaa 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -763,3 +763,15 @@ def cuda_device_count_stateless() -> int: # after https://github.com/pytorch/pytorch/pull/122815 is released. 
return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) + + +#From: https://stackoverflow.com/a/4104188/2749989 +def run_once(f): + + def wrapper(*args, **kwargs) -> Any: + if not wrapper.has_run: # type: ignore[attr-defined] + wrapper.has_run = True # type: ignore[attr-defined] + return f(*args, **kwargs) + + wrapper.has_run = False # type: ignore[attr-defined] + return wrapper From 95db455e7f337e99ffafd0b14367a7cbc11dca43 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Jun 2024 12:45:05 -0400 Subject: [PATCH 38/98] [Misc] Add channel-wise quantization support for w8a8 dynamic per token activation quantization (#5542) --- tests/quantization/test_compressed_tensors.py | 13 +++++-- vllm/model_executor/layers/linear.py | 13 ------- .../compressed_tensors/compressed_tensors.py | 14 ++++--- .../compressed_tensors_w8a8_dynamictoken.py | 37 ++++++++++++++----- 4 files changed, 45 insertions(+), 32 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 611c6b8b7fb9a..b78081155e2ba 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -14,7 +14,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" + model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" with vllm_runner(model_path, enforce_eager=True) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -43,15 +43,19 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): def test_compressed_tensors_no_enforce_eager(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" + model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" with vllm_runner(model_path) as llm: sampling_params = SamplingParams() output = llm.generate("Hello world!", sampling_params=sampling_params) assert output -def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2" +@pytest.mark.parametrize("model_args", [ + ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), + ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"), +]) +def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): + model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -60,6 +64,7 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) + assert qkv_proj.scheme.strategy == strategy assert qkv_proj.weight.dtype is torch.int8 diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 58c379bcd88df..45f805547b414 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -468,13 +468,6 @@ def weight_loader(self, "MergedColumnParallelLinear, assume the weight is " "the same for all partitions.") - if fp8_scales_shard_indexer is None: - if len(param_data.shape) == 0: - param_data = param_data.reshape(1) - - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape 
param_data.copy_(loaded_weight) @@ -686,12 +679,6 @@ def weight_loader(self, "QKVParallelLinear, assume the weight is the same " "for all partitions.") - if len(param_data.shape) == 0: - param_data = param_data.reshape(1) - - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 92a84b3c0dd89..347a052a663da 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -95,14 +95,15 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel, def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - is_token_tensor = (weight_quant.strategy - == QuantizationStrategy.TENSOR.value) and ( - input_quant.strategy - == QuantizationStrategy.TOKEN.value) + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_token = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TOKEN.value) is_symmetric = weight_quant.symmetric and input_quant.symmetric is_dynamic = not weight_quant.dynamic and input_quant.dynamic - return is_8_bits and is_token_tensor and is_symmetric and is_dynamic + return is_8_bits and is_token and is_symmetric and is_dynamic def _is_w4a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -133,7 +134,8 @@ def _get_schema(self, weight_quant: BaseModel, return CompressedTensorsW8A8StaticTensor() if self._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8DynamicToken() + return CompressedTensorsW8A8DynamicToken( + strategy=weight_quant.strategy) raise NotImplementedError( "No compressed-tensors compatible scheme was found.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py index d514d7b28cfd9..37610c9c2898b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py @@ -6,6 +6,8 @@ from vllm import _custom_ops as custom_ops from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) from vllm.model_executor.utils import set_weight_attrs __all__ = ["CompressedTensorsW8A8DynamicToken"] @@ -13,6 +15,9 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): + def __init__(self, strategy: str): + self.strategy = strategy + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: if isinstance(shard_id, int): return shard_id @@ -45,11 +50,17 @@ def create_weights(self, layer: torch.nn.Module, # CompressedTensorsW8A8StaticTensor::create_weights for further # information. 
is_tensor_partitioned = len(output_partition_sizes) != 1 - weight_scale_dim = sum( - output_partition_sizes) if is_tensor_partitioned else 1 + # when doing channel-wise quantization, number of scales + # is equal to output_dim + weight_scale_dim = sum(output_partition_sizes) if ( + is_tensor_partitioned + or self.strategy == QuantizationStrategy.CHANNEL) else 1 + + shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, ) + if self.strategy == QuantizationStrategy.CHANNEL: + shape = (weight_scale_dim, 1) - weight_scale = Parameter(torch.empty(weight_scale_dim, - dtype=torch.float32), + weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32), requires_grad=False) weight = Parameter(torch.empty(sum(output_partition_sizes), @@ -67,12 +78,20 @@ def create_weights(self, layer: torch.nn.Module, }) layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs( - weight_scale, { - "weight_loader": weight_loader, - "shard_splitter": self.scales_shard_splitter, - "logical_widths": output_partition_sizes + set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) + + # Don't need a shard_splitter for channel-wise quantization + # Use the default loading method + if self.strategy == QuantizationStrategy.CHANNEL: + set_weight_attrs(weight_scale, { + "output_dim": 0, }) + else: + set_weight_attrs( + weight_scale, { + "logical_widths": output_partition_sizes, + "shard_splitter": self.scales_shard_splitter, + }) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight From 19091efc44c6f9b1e008dc5469c63a1f01684745 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 18 Jun 2024 11:00:36 -0700 Subject: [PATCH 39/98] [ci] Setup Release pipeline and build release wheels with cache (#5610) Signed-off-by: kevin --- .buildkite/release-pipeline.yaml | 21 ++++++++++++ Dockerfile | 58 ++++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 17 deletions(-) create mode 100644 .buildkite/release-pipeline.yaml diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml new file mode 100644 index 0000000000000..1959f9752069f --- /dev/null +++ b/.buildkite/release-pipeline.yaml @@ -0,0 +1,21 @@ +steps: + - block: "Build wheels" + + - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" + agents: + queue: cpu_queue + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ." 
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" + matrix: + setup: + cuda_version: + - "11.8.0" + - "12.1.0" + python_version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" diff --git a/Dockerfile b/Dockerfile index 72894e7cab9ba..5b3e682a80169 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,26 @@ # docs/source/dev/dockerfile/dockerfile.rst and # docs/source/assets/dev/dockerfile-stages-dependency.png +ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment -FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base + +ARG CUDA_VERSION=12.4.1 +ARG PYTHON_VERSION=3 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \ + && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ + && python3 --version \ + && python3 -m pip --version RUN apt-get update -y \ && apt-get install -y python3-pip git curl sudo @@ -16,7 +33,7 @@ RUN apt-get update -y \ # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image # or future versions of triton. -RUN ldconfig /usr/local/cuda-12.4/compat/ +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ WORKDIR /workspace @@ -24,14 +41,7 @@ WORKDIR /workspace COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-cuda.txt - -# install development dependencies -COPY requirements-lint.txt requirements-lint.txt -COPY requirements-test.txt requirements-test.txt -COPY requirements-dev.txt requirements-dev.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-dev.txt + python3 -m pip install -r requirements-cuda.txt # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -41,14 +51,16 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} #################### BASE BUILD IMAGE #################### - #################### WHEEL BUILD IMAGE #################### -FROM dev AS build +FROM base AS build + +ARG PYTHON_VERSION=3 # install build dependencies COPY requirements-build.txt requirements-build.txt + RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-build.txt + python3 -m pip install -r requirements-build.txt # install compiler cache to speed up compilation leveraging local or remote caching RUN apt-get update -y && apt-get install -y ccache @@ -101,9 +113,21 @@ RUN python3 check-wheel-size.py dist #################### EXTENSION Build IMAGE #################### +#################### DEV IMAGE #################### +FROM base as dev + +COPY requirements-lint.txt requirements-lint.txt +COPY requirements-test.txt requirements-test.txt +COPY requirements-dev.txt requirements-dev.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-dev.txt + +#################### DEV IMAGE #################### + #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +ARG CUDA_VERSION=12.4.1 WORKDIR /vllm-workspace RUN apt-get update -y \ @@ -113,12 +137,12 @@ RUN apt-get update -y \ # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image # or future versions of triton. -RUN ldconfig /usr/local/cuda-12.4/compat/ +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # install vllm wheel first, so that torch etc will be installed RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ - pip install dist/*.whl --verbose + python3 -m pip install dist/*.whl --verbose #################### vLLM installation IMAGE #################### @@ -131,7 +155,7 @@ ADD . 
/vllm-workspace/ # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-dev.txt + python3 -m pip install -r requirements-dev.txt # doc requires source code # we hide them inside `test_docs/` , so that this source code From 07feecde1a69859d565786a7ad64c0f604f17b28 Mon Sep 17 00:00:00 2001 From: sergey-tinkoff <167607910+sergey-tinkoff@users.noreply.github.com> Date: Tue, 18 Jun 2024 21:01:21 +0300 Subject: [PATCH 40/98] [Model] LoRA support added for command-r (#5178) --- csrc/punica/bgmv/bgmv_config.h | 6 ++++ tests/lora/test_punica.py | 2 ++ vllm/model_executor/models/commandr.py | 48 ++++++++++++++++++++++---- 3 files changed, 50 insertions(+), 6 deletions(-) mode change 100644 => 100755 csrc/punica/bgmv/bgmv_config.h diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h old mode 100644 new mode 100755 index 0456b4bc263cc..c38db2dcd2c4d --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -69,6 +69,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 36864) \ f(in_T, out_T, W_T, narrow, 43264) \ f(in_T, out_T, W_T, narrow, 49152) \ + f(in_T, out_T, W_T, narrow, 60544) \ + f(in_T, out_T, W_T, narrow, 60672) \ f(in_T, out_T, W_T, narrow, 64000) \ f(in_T, out_T, W_T, narrow, 64256) \ f(in_T, out_T, W_T, narrow, 64512) \ @@ -78,6 +80,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128000) \ f(in_T, out_T, W_T, narrow, 128256) \ f(in_T, out_T, W_T, narrow, 128512) \ + + // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA // and vllm/tests/lora/test_punica.py @@ -144,6 +148,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 36864, narrow) \ f(in_T, out_T, W_T, 43264, narrow) \ f(in_T, out_T, W_T, 49152, narrow) \ + f(in_T, out_T, W_T, 60544, narrow) \ + f(in_T, out_T, W_T, 60672, narrow) \ f(in_T, out_T, W_T, 64000, narrow) \ f(in_T, out_T, W_T, 64256, narrow) \ f(in_T, out_T, W_T, 64512, narrow) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index d87658e5dd886..dae1d56874628 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -94,6 +94,8 @@ def _lora_ref_impl( 36864, 43264, 49152, + 60544, + 60672, 64000, 64256, 102400, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 11d88d45e993c..600c2990b3691 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -29,7 +29,7 @@ from transformers import CohereConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -265,10 +265,14 @@ def __init__( config: CohereConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ): super().__init__() self.config = config - self.vocab_size = config.vocab_size + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size) self.layers = nn.ModuleList([ @@ 
-302,18 +306,44 @@ def forward( class CohereForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" + ] + embedding_modules = {"embed_tokens": "input_embeddings"} + embedding_padding_modules = [] + def __init__( self, config: CohereConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.quant_config = quant_config - self.logits_processor = LogitsProcessor(config.vocab_size, + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, scale=config.logit_scale) - self.model = CohereModel(config, cache_config, quant_config) + self.model = CohereModel(config, + cache_config, + quant_config, + lora_config=lora_config) self.sampler = Sampler() @torch.no_grad() @@ -330,8 +360,14 @@ def forward( def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.model.embed_tokens.weight, - hidden_states, sampling_metadata) + is_not_lora = hasattr(self.model.embed_tokens, 'weight') + if is_not_lora: + embedding_weights = self.model.embed_tokens.weight + else: + embedding_weights = self.model.embed_tokens.base_layer.weight + + logits = self.logits_processor(embedding_weights, hidden_states, + sampling_metadata) return logits def sample( From 8a173382c80d6730e1bbc81f932ac3721ab2cd9d Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 18 Jun 2024 23:18:37 +0200 Subject: [PATCH 41/98] [Bugfix] Fix for inconsistent behaviour related to sampling and repetition penalties (#5639) Signed-off-by: Thomas Parnell --- vllm/model_executor/sampling_metadata.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7ad84f51b7e4c..f95de56f39b57 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -386,18 +386,10 @@ def from_sampling_metadata( presence_penalties += [0] * prefill_len frequency_penalties += [0] * prefill_len repetition_penalties += [1] * prefill_len - if do_penalties: - prompt_tokens.extend([] for _ in range(prefill_len)) - output_tokens.extend([] for _ in range(prefill_len)) if seq_group.do_sample: sample_lens = len(seq_group.sample_indices) assert sample_lens == len(seq_ids) - for seq_id in seq_ids: - seq_data = seq_group.seq_data[seq_id] - if do_penalties: - prompt_tokens.append(seq_data.prompt_token_ids) - output_tokens.append(seq_data.output_token_ids) temperatures += [temperature] * len(seq_ids) top_ps += [top_p] * len(seq_ids) top_ks += [top_k] * len(seq_ids) @@ -424,6 +416,20 @@ def from_sampling_metadata( sampling_seeds.append(seq_seeds) sample_indices.extend(seq_group.sample_indices) + if do_penalties: + for seq_group in sampling_metadata.seq_groups: + seq_ids = seq_group.seq_ids + if (seq_group.is_prompt + and sampling_params.prompt_logprobs is not None): + prefill_len = len(seq_group.prompt_logprob_indices) + prompt_tokens.extend([] for _ in range(prefill_len)) + output_tokens.extend([] for _ in 
range(prefill_len)) + if seq_group.do_sample: + for seq_id in seq_ids: + seq_data = seq_group.seq_data[seq_id] + prompt_tokens.append(seq_data.prompt_token_ids) + output_tokens.append(seq_data.output_token_ids) + sampling_tensors = SamplingTensors.from_lists( temperatures, top_ps, top_ks, min_ps, presence_penalties, frequency_penalties, repetition_penalties, sampling_seeds, From 2bd231a7b7787407ccba36f966603578842d03f7 Mon Sep 17 00:00:00 2001 From: milo157 <43028253+milo157@users.noreply.github.com> Date: Tue, 18 Jun 2024 18:56:59 -0400 Subject: [PATCH 42/98] [Doc] Added cerebrium as Integration option (#5553) --- .../serving/deploying_with_cerebrium.rst | 109 ++++++++++++++++++ docs/source/serving/integrations.rst | 1 + 2 files changed, 110 insertions(+) create mode 100644 docs/source/serving/deploying_with_cerebrium.rst diff --git a/docs/source/serving/deploying_with_cerebrium.rst b/docs/source/serving/deploying_with_cerebrium.rst new file mode 100644 index 0000000000000..ff0ac911108c4 --- /dev/null +++ b/docs/source/serving/deploying_with_cerebrium.rst @@ -0,0 +1,109 @@ +.. _deploying_with_cerebrium: + +Deploying with Cerebrium +============================ + +.. raw:: html + +

+    <!-- centered screenshot: vLLM_plus_cerebrium -->

+ +vLLM can be run on a cloud based GPU machine with `Cerebrium `__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. + +To install the Cerebrium client, run: + +.. code-block:: console + + $ pip install cerebrium + $ cerebrium login + +Next, create your Cerebrium project, run: + +.. code-block:: console + + $ cerebrium init vllm-project + +Next, to install the required packages, add the following to your cerebrium.toml: + +.. code-block:: toml + + [cerebrium.dependencies.pip] + vllm = "latest" + +Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py`: + +.. code-block:: python + + from vllm import LLM, SamplingParams + + llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") + + def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): + + sampling_params = SamplingParams(temperature=temperature, top_p=top_p) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + results = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + results.append({"prompt": prompt, "generated_text": generated_text}) + + return {"results": results} + + +Then, run the following code to deploy it to the cloud + +.. code-block:: console + + $ cerebrium deploy + +If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run) + +.. code-block:: python + + curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ + -H 'Content-Type: application/json' \ + -H 'Authorization: ' \ + --data '{ + "prompts": [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" + ] + }' + +You should get a response like: + +.. code-block:: python + + { + "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", + "result": { + "result": [ + { + "prompt": "Hello, my name is", + "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" + }, + { + "prompt": "The president of the United States is", + "generated_text": " elected every four years. This is a democratic system.\n\n5. What" + }, + { + "prompt": "The capital of France is", + "generated_text": " Paris.\n" + }, + { + "prompt": "The future of AI is", + "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." + } + ] + }, + "run_time_ms": 152.53663063049316 + } + +You now have an autoscaling endpoint where you only pay for the compute you use! 
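If you prefer Python over ``curl``, the same request can be sent with ``requests``. This is a minimal sketch: the project id below is the same placeholder shown in the deploy output above, and the auth token is a stand-in that you must replace with the token from your Cerebrium dashboard.

.. code-block:: python

    import requests

    # Placeholder endpoint and token; substitute your own values from `cerebrium deploy`.
    url = "https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "<YOUR_AUTH_TOKEN>",
    }
    payload = {"prompts": ["The capital of France is"]}

    response = requests.post(url, headers=headers, json=payload)
    print(response.json())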
+ diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst index 83a8b5a88bd38..680ea523dfe94 100644 --- a/docs/source/serving/integrations.rst +++ b/docs/source/serving/integrations.rst @@ -8,6 +8,7 @@ Integrations deploying_with_kserve deploying_with_triton deploying_with_bentoml + deploying_with_cerebrium deploying_with_lws deploying_with_dstack serving_with_langchain From b23ce9203235488e080434108d3504d54b24e867 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 18 Jun 2024 19:48:49 -0400 Subject: [PATCH 43/98] [Bugfix] Fix CUDA version check for mma warning suppression (#5642) --- csrc/quantization/marlin/sparse/common/mma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 8a6c65338f277..b26505f771c8b 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -17,6 +17,7 @@ #pragma once #include "base.h" +#include namespace marlin_24 { @@ -26,7 +27,7 @@ namespace marlin_24 { // | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction // | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially // | reduced performance on some future architectures -#if defined CUDA_VERSION && CUDA_VERSION >= 12500 +#if defined CUDA_VERSION && CUDA_VERSION >= 12050 #define MMA_SP_INST \ "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " #else From 6820724e51079120251c8522afd385ca64abc948 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 18 Jun 2024 20:33:25 -0400 Subject: [PATCH 44/98] [Bugfix] Fix w8a8 benchmarks for int8 case (#5643) --- benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 523e970c2c9be..5cc0fbbd49b8e 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -120,9 +120,8 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, # cutlass impl timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.bfloat16, label, sub_label, cutlass_impl, - "cutlass_i8_i8_bf16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm")) return timers From 59a1eb59c9cb383e5ea36d7253f81ff2ea7766cc Mon Sep 17 00:00:00 2001 From: Shukant Pal Date: Tue, 18 Jun 2024 18:46:38 -0700 Subject: [PATCH 45/98] [Bugfix] Fix Phi-3 Long RoPE scaling implementation (#5628) --- vllm/model_executor/layers/rotary_embedding.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 9c0a74cdab96e..a0b19046b7491 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -507,8 +507,8 @@ def __init__( dtype: torch.dtype, short_factor: List[float], long_factor: List[float], - short_mscale: float = 1.1, - long_mscale: float = 1.225, + short_mscale: float = 1.0, + long_mscale: float = 1.0, ): super().__init__() @@ -530,6 +530,16 @@ def __init__( self.short_mscale = short_mscale self.long_mscale = long_mscale + scale = (self.max_position_embeddings / + self.original_max_position_embeddings) + + if scale <= 1.0: + 
self.scaling_factor = 1.0 + else: + self.scaling_factor = math.sqrt( + 1 + math.log(scale) / + math.log(self.original_max_position_embeddings)) + short_cache = self._compute_cos_sin_cache( original_max_position_embeddings, short_factor, short_mscale) short_cache = short_cache.to(dtype) @@ -565,8 +575,8 @@ def _compute_cos_sin_cache( inv_freq = self._compute_inv_freq(rescale_factors) t = torch.arange(max_position_embeddings, dtype=torch.float) freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() * mscale - sin = freqs.sin() * mscale + cos = freqs.cos() * mscale * self.scaling_factor + sin = freqs.sin() * mscale * self.scaling_factor cache = torch.cat((cos, sin), dim=-1) return cache From e5150f2c281f052df42121ae60827156abe57173 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 19 Jun 2024 08:03:55 +0200 Subject: [PATCH 46/98] [Bugfix] Added test for sampling repetition penalty bug. (#5659) Signed-off-by: Thomas Parnell --- tests/samplers/test_sampler.py | 69 ++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index c6ef4358ea5fa..0aabde6aa8c5c 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -631,3 +631,72 @@ def mock_sample(probs, *args, **kwargs): hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) assert torch.allclose(hf_probs, sample_probs, atol=1e-5) assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_repetition_penalty_mixed(device: str): + + vocab_size = 8 + + def test_sampling_params(sampling_params: List[SamplingParams]): + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(2): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData([1, 2, 3])}, + sampling_params=sampling_params[i], + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + fake_logits = torch.full((2, vocab_size), + 1e-2, + device=device, + dtype=torch.float16) + + fake_logits[:, 5] = 1.1e-2 + fake_logits[:, 1] = 1.2e-2 + + sampler = MockLogitsSampler(fake_logits) + + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + generated_tokens = [] + for output in sampler_output: + generated_tokens.append(output.samples[0].output_token) + + return generated_tokens + + # one configuration is greedy with repetition_penalty + sampling_params_rep = SamplingParams( + temperature=0.0, + repetition_penalty=2.0, + ) + + # other configuration is sampling w/o repetition_penalty + sampling_params_sample = SamplingParams( + temperature=1.0, + top_k=1, + seed=42, + ) + + tokens1 = test_sampling_params( + [sampling_params_rep, sampling_params_sample]) + + tokens2 = test_sampling_params( + [sampling_params_sample, sampling_params_rep]) + + assert tokens1[0] == tokens2[1] + assert tokens1[1] == tokens2[0] From f758aed0e851687e919a4ee09ab872ee2c8fe159 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Wed, 19 Jun 2024 02:21:29 -0400 Subject: [PATCH 47/98] [Bugfix][CI/Build][AMD][ROCm]Fixed the cmake build bug which generate garbage on certain devices (#5641) --- Dockerfile.rocm | 17 
++++++++--------- cmake/utils.cmake | 5 ++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 724fa1673c3b3..6bda696859c8b 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -7,9 +7,8 @@ ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" RUN echo "Base image is $BASE_IMAGE" -# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" -# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - +ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ + ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ARG FA_GFX_ARCHS="gfx90a;gfx942" RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" @@ -68,7 +67,7 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ + && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \ patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ && python3 setup.py install \ && cd ..; \ @@ -76,7 +75,7 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ +RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi # build triton @@ -107,11 +106,11 @@ ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ - && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ + && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \ && python3 setup.py install \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \ + && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \ + && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \ && cd .. diff --git a/cmake/utils.cmake b/cmake/utils.cmake index f3c1286dd8498..071e16336dfa2 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -155,8 +155,11 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Find the intersection of the supported + detected architectures to # set the module architecture flags. 
# + + set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") + set(${GPU_ARCHES}) - foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES}) + foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS}) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() From 3eea74889fe29534808bae41fca251e0e74c0962 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 19 Jun 2024 01:05:00 -0700 Subject: [PATCH 48/98] [misc][distributed] use 127.0.0.1 for single-node (#5619) --- vllm/executor/multiproc_gpu_executor.py | 7 +++++-- vllm/executor/ray_gpu_executor.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 8385e56f88b39..e63e5a3a027fa 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (cuda_device_count_stateless, - get_distributed_init_method, get_ip, get_open_port, + get_distributed_init_method, get_open_port, get_vllm_instance_id, make_async) logger = init_logger(__name__) @@ -37,8 +37,11 @@ def _init_executor(self) -> None: assert world_size <= cuda_device_count_stateless(), ( "please set tensor_parallel_size to less than max local gpu count") + # Multiprocessing-based executor does not support multi-node setting. + # Since it only works for single node, we can use the loopback address + # 127.0.0.1 for communication. distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) + "127.0.0.1", get_open_port()) if world_size == 1: self.workers = [] diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 843332e5ea0c8..fc83c552888a6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -161,6 +161,16 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. 
+ driver_ip = "127.0.0.1" distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) From da971ec7a5b35f33981cff9ca50064d3166953f9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 19 Jun 2024 05:38:26 -0400 Subject: [PATCH 49/98] [Model] Add FP8 kv cache for Qwen2 (#5656) --- vllm/model_executor/models/qwen2.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 9a4829a27873e..b5d13bb6b937c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -46,6 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput +from vllm.utils import print_warning_once class Qwen2MLP(nn.Module): @@ -375,6 +376,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + print_warning_once( + f"Found kv scale in the checkpoint (e.g. {name}), " + "but not found the expected name in the model " + f"(e.g. {remapped_kv_scale_name}). kv-scale is " + "not loaded.") + continue + else: + name = remapped_kv_scale_name param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From 7d46c8d37864993162bbeb61dc19b5ad6043646d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 19 Jun 2024 17:58:32 +0800 Subject: [PATCH 50/98] [Bugfix] Fix sampling_params passed incorrectly in Phi3v example (#5684) --- examples/phi3v_example.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py index d5e60ae1ee3a2..4f37c47ddca87 100644 --- a/examples/phi3v_example.py +++ b/examples/phi3v_example.py @@ -12,7 +12,6 @@ def run_phi3v(): llm = LLM( model=model_path, trust_remote_code=True, - max_model_len=4096, image_input_type="pixel_values", image_token_id=32044, image_input_shape="1,3,1008,1344", @@ -28,11 +27,12 @@ def run_phi3v(): sampling_params = SamplingParams(temperature=0, max_tokens=64) - outputs = llm.generate({ - "prompt": prompt, - "sampling_params": sampling_params, - "multi_modal_data": ImagePixelData(image), - }) + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": ImagePixelData(image), + }, + sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text print(generated_text) From d8714530d11603a159a46ea0dde299f95807cfde Mon Sep 17 00:00:00 2001 From: DearPlanet Date: Wed, 19 Jun 2024 18:19:08 +0800 Subject: [PATCH 51/98] [Misc]Add param max-model-len in benchmark_latency.py (#5629) --- benchmarks/benchmark_latency.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 98e0be2779922..e9d1048c89b64 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -29,6 +29,7 @@ def main(args: argparse.Namespace): tensor_parallel_size=args.tensor_parallel_size, trust_remote_code=args.trust_remote_code, dtype=args.dtype, + max_model_len=args.max_model_len, enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, 
quantization_param_path=args.quantization_param_path, @@ -150,6 +151,12 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') + parser.add_argument( + '--max-model-len', + type=int, + default=None, + help='Maximum length of a sequence (including prompt and output). ' + 'If None, will be derived from the model.') parser.add_argument( '--dtype', type=str, From e9c2732b976612b6362635be2984f03bfabc20ec Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 19 Jun 2024 22:37:33 +0800 Subject: [PATCH 52/98] [CI/Build] Add tqdm to dependencies (#5680) --- requirements-common.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-common.txt b/requirements-common.txt index 32e2ebe8c6159..05969cfa5d65f 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -4,6 +4,7 @@ psutil sentencepiece # Required for LLaMA tokenizer. numpy < 2.0.0 requests +tqdm py-cpuinfo transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. tokenizers >= 0.19.1 # Required for Llama 3. From 3ee5c4bca514ee95592a018fae95e050fd6763c0 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 19 Jun 2024 07:42:13 -0700 Subject: [PATCH 53/98] [ci] Add A100 queue into AWS CI template (#5648) Signed-off-by: kevin --- .../benchmark-pipeline.yaml | 1 + .buildkite/test-pipeline.yaml | 5 ++ .buildkite/test-template-aws.j2 | 46 +++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 8f12748b68f39..2b25c954b5c5c 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -17,6 +17,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: perf-benchmark containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT command: diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5afe3730210e8..c2160fee348e5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -181,3 +181,8 @@ steps: commands: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html + +- label: A100 status + gpu: a100 + commands: + - nvidia-smi diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 01f7ff1e0e2b5..08146bf4454cc 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -49,6 +49,51 @@ steps: command: bash .buildkite/run-cpu-test.sh {% for step in steps %} + {% if step.gpu == "a100" %} + - label: "{{ step.label }}" + agents: + queue: a100-queue + soft_fail: {{ step.soft_fail or false }} + {% if step.parallelism %} + parallelism: {{ step.parallelism }} + {% endif %} + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 5 + - exit_status: -10 # Agent was lost + limit: 5 + plugins: + - kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: {{ docker_image }} + command: ["bash"] + args: + - '-c' + - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + 
emptyDir: + medium: Memory + {% else %} - label: "{{ step.label }}" agents: {% if step.label == "Documentation Build" %} @@ -90,4 +135,5 @@ steps: {% endif %} volumes: - /dev/shm:/dev/shm + {% endif %} {% endfor %} From afed90a0344b1b0ce6aae46efc630adb489ec769 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 19 Jun 2024 14:41:42 -0400 Subject: [PATCH 54/98] [Frontend][Bugfix] Fix preemption_mode -> preemption-mode for CLI arg in arg_utils.py (#5688) --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 647793a6d5cd1..7f760c2775f3d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -577,7 +577,7 @@ def add_cli_args( 'This should be a JSON string that will be ' 'parsed into a dictionary.') parser.add_argument( - '--preemption_mode', + '--preemption-mode', type=str, default=None, help='If \'recompute\', the engine performs preemption by block ' From d571ca010813c5532c646ee74c1a2c9e1e78e12a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 19 Jun 2024 13:16:04 -0700 Subject: [PATCH 55/98] [ci][distributed] add tests for custom allreduce (#5689) --- .buildkite/test-pipeline.yaml | 8 ++++++-- tests/distributed/test_custom_all_reduce.py | 7 ++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c2160fee348e5..b1602dd9496ba 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -182,7 +182,11 @@ steps: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html -- label: A100 status +- label: Distributed Tests (A100) gpu: a100 commands: - - nvidia-smi + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 3776c1f91a3f2..9a39160b8a462 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -11,7 +11,8 @@ from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, get_tp_group, graph_capture) -from ..utils import (init_test_distributed_environment, +from ..utils import (ensure_model_parallel_initialized, + init_test_distributed_environment, multi_process_tensor_parallel) random.seed(42) @@ -27,8 +28,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) - - group = get_tensor_model_parallel_group() + ensure_model_parallel_initialized(tp_size, pp_size) + group = get_tensor_model_parallel_group().device_group # A small all_reduce for warmup. 
# this is needed because device communicators might be created lazily From 78687504f7eb6d7523bff15b1bca8c9cbb74656a Mon Sep 17 00:00:00 2001 From: zifeitong Date: Wed, 19 Jun 2024 13:57:12 -0700 Subject: [PATCH 56/98] [Bugfix] AsyncLLMEngine hangs with asyncio.run (#5654) --- tests/async_engine/test_async_llm_engine.py | 38 +++- tests/spec_decode/e2e/conftest.py | 43 +---- tests/utils.py | 43 ++++- vllm/engine/async_llm_engine.py | 5 +- vllm/engine/async_timeout.py | 189 ++++++++++++++++++++ 5 files changed, 271 insertions(+), 47 deletions(-) create mode 100644 vllm/engine/async_timeout.py diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 10a46422887e3..52d3394a96a13 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -2,8 +2,12 @@ from dataclasses import dataclass import pytest +import torch -from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm import SamplingParams +from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine + +from ..utils import wait_for_gpu_memory_to_clear @dataclass @@ -94,3 +98,35 @@ async def test_new_requests_event(): assert engine.get_model_config() is not None assert engine.get_tokenizer() is not None assert engine.get_decoding_config() is not None + + +def test_asyncio_run(): + wait_for_gpu_memory_to_clear( + devices=list(range(torch.cuda.device_count())), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + engine = AsyncLLMEngine.from_engine_args( + AsyncEngineArgs(model="facebook/opt-125m")) + + async def run(prompt: str): + sampling_params = SamplingParams( + temperature=0, + max_tokens=32, + ) + + async for output in engine.generate(prompt, + sampling_params, + request_id=prompt): + final_output = output + return final_output + + async def generate(): + return await asyncio.gather( + run("test0"), + run("test1"), + ) + + results = asyncio.run(generate()) + assert len(results) == 2 diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 86103cf85484a..60dfe33f2918b 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,5 +1,4 @@ import asyncio -import time from itertools import cycle from typing import Dict, List, Optional, Tuple, Union @@ -7,12 +6,6 @@ import ray import torch -from vllm.utils import is_hip - -if (not is_hip()): - from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, - nvmlInit) - from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -26,6 +19,7 @@ from vllm.utils import Counter, random_uuid from ...conftest import cleanup +from ...utils import wait_for_gpu_memory_to_clear class AsyncLLM: @@ -291,38 +285,3 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, print(f'{i=} {baseline_token_ids=}') print(f'{i=} {spec_token_ids=}') assert baseline_token_ids == spec_token_ids - - -def wait_for_gpu_memory_to_clear(devices: List[int], - threshold_bytes: int, - timeout_s: float = 120) -> None: - # Use nvml instead of pytorch to reduce measurement error from torch cuda - # context. 
- nvmlInit() - start_time = time.time() - while True: - output: Dict[int, str] = {} - output_raw: Dict[int, float] = {} - for device in devices: - dev_handle = nvmlDeviceGetHandleByIndex(device) - mem_info = nvmlDeviceGetMemoryInfo(dev_handle) - gb_used = mem_info.used / 2**30 - output_raw[device] = gb_used - output[device] = f'{gb_used:.02f}' - - print('gpu memory used (GB): ', end='') - for k, v in output.items(): - print(f'{k}={v}; ', end='') - print('') - - dur_s = time.time() - start_time - if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()): - print(f'Done waiting for free GPU memory on devices {devices=} ' - f'({threshold_bytes/2**30=}) {dur_s=:.02f}') - break - - if dur_s >= timeout_s: - raise ValueError(f'Memory of devices {devices=} not free after ' - f'{dur_s=:.02f} ({threshold_bytes/2**30=})') - - time.sleep(5) diff --git a/tests/utils.py b/tests/utils.py index f2b2d22b1ebc5..bc30515c83100 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,7 @@ import time import warnings from contextlib import contextmanager -from typing import List +from typing import Dict, List import openai import ray @@ -13,7 +13,11 @@ from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.utils import get_open_port +from vllm.utils import get_open_port, is_hip + +if (not is_hip()): + from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, + nvmlInit) # Path to root of repository so that utilities can be imported by ray workers VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) @@ -154,3 +158,38 @@ def error_on_warning(): warnings.simplefilter("error") yield + + +def wait_for_gpu_memory_to_clear(devices: List[int], + threshold_bytes: int, + timeout_s: float = 120) -> None: + # Use nvml instead of pytorch to reduce measurement error from torch cuda + # context. + nvmlInit() + start_time = time.time() + while True: + output: Dict[int, str] = {} + output_raw: Dict[int, float] = {} + for device in devices: + dev_handle = nvmlDeviceGetHandleByIndex(device) + mem_info = nvmlDeviceGetMemoryInfo(dev_handle) + gb_used = mem_info.used / 2**30 + output_raw[device] = gb_used + output[device] = f'{gb_used:.02f}' + + print('gpu memory used (GB): ', end='') + for k, v in output.items(): + print(f'{k}={v}; ', end='') + print('') + + dur_s = time.time() - start_time + if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()): + print(f'Done waiting for free GPU memory on devices {devices=} ' + f'({threshold_bytes/2**30=}) {dur_s=:.02f}') + break + + if dur_s >= timeout_s: + raise ValueError(f'Memory of devices {devices=} not free after ' + f'{dur_s=:.02f} ({threshold_bytes/2**30=})') + + time.sleep(5) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 86720e4fbe7c6..df25eb111e87f 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -10,6 +10,7 @@ from vllm.config import DecodingConfig, ModelConfig from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_timeout import asyncio_timeout from vllm.engine.llm_engine import LLMEngine from vllm.executor.ray_utils import initialize_ray_cluster, ray from vllm.inputs import LLMInputs, PromptInputs @@ -545,8 +546,8 @@ async def run_engine_loop(self): # Abort if iteration takes too long due to unrecoverable errors # (eg. NCCL timeouts). 
try: - has_requests_in_progress = await asyncio.wait_for( - self.engine_step(), ENGINE_ITERATION_TIMEOUT_S) + async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S): + has_requests_in_progress = await self.engine_step() except asyncio.TimeoutError as exc: logger.error( "Engine iteration timed out. This should never happen!") diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py new file mode 100644 index 0000000000000..4b18426252127 --- /dev/null +++ b/vllm/engine/async_timeout.py @@ -0,0 +1,189 @@ +# Workaround for https://github.com/python/cpython/issues/86296 +# +# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py +# Licensed under the Apache License (Apache-2.0) + +import asyncio +import enum +import sys +import warnings +from types import TracebackType +from typing import Any, Optional, Type + +if sys.version_info[:2] >= (3, 11): + from asyncio import timeout as asyncio_timeout +else: + + def asyncio_timeout(delay: Optional[float]) -> "Timeout": + """timeout context manager. + Useful in cases when you want to apply timeout logic around block + of code or in cases when asyncio.wait_for is not suitable. For example: + >>> async with timeout(0.001): + ... async with aiohttp.get('https://github.com') as r: + ... await r.text() + delay - value in seconds or None to disable timeout logic + """ + loop = asyncio.get_running_loop() + deadline = loop.time() + delay if delay is not None else None + return Timeout(deadline, loop) + + class _State(enum.Enum): + INIT = "INIT" + ENTER = "ENTER" + TIMEOUT = "TIMEOUT" + EXIT = "EXIT" + + class Timeout: + # Internal class, please don't instantiate it directly + # Use timeout() and timeout_at() public factories instead. + # + # Implementation note: `async with timeout()` is preferred + # over `with timeout()`. + # While technically the Timeout class implementation + # doesn't need to be async at all, + # the `async with` statement explicitly points that + # the context manager should be used from async function context. + # + # This design allows to avoid many silly misusages. + # + # TimeoutError is raised immediately when scheduled + # if the deadline is passed. + # The purpose is to time out as soon as possible + # without waiting for the next await expression. 
+ + __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler") + + def __init__(self, deadline: Optional[float], + loop: asyncio.AbstractEventLoop) -> None: + self._loop = loop + self._state = _State.INIT + + self._timeout_handler = None # type: Optional[asyncio.Handle] + if deadline is None: + self._deadline = None # type: Optional[float] + else: + self.update(deadline) + + def __enter__(self) -> "Timeout": + warnings.warn( + "with timeout() is deprecated, use async with timeout()", + DeprecationWarning, + stacklevel=2, + ) + self._do_enter() + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> Optional[bool]: + self._do_exit(exc_type) + return None + + async def __aenter__(self) -> "Timeout": + self._do_enter() + return self + + async def __aexit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> Optional[bool]: + self._do_exit(exc_type) + return None + + @property + def expired(self) -> bool: + """Is timeout expired during execution?""" + return self._state == _State.TIMEOUT + + @property + def deadline(self) -> Optional[float]: + return self._deadline + + def reject(self) -> None: + """Reject scheduled timeout if any.""" + # cancel is maybe better name but + # task.cancel() raises CancelledError in asyncio world. + if self._state not in (_State.INIT, _State.ENTER): + raise RuntimeError(f"invalid state {self._state.value}") + self._reject() + + def _reject(self) -> None: + if self._timeout_handler is not None: + self._timeout_handler.cancel() + self._timeout_handler = None + + def shift(self, delay: float) -> None: + """Advance timeout on delay seconds. + The delay can be negative. + Raise RuntimeError if shift is called when deadline is not scheduled + """ + deadline = self._deadline + if deadline is None: + raise RuntimeError( + "cannot shift timeout if deadline is not scheduled") + self.update(deadline + delay) + + def update(self, deadline: float) -> None: + """Set deadline to absolute value. + deadline argument points on the time in the same clock system + as loop.time(). + If new deadline is in the past the timeout is raised immediately. + Please note: it is not POSIX time but a time with + undefined starting base, e.g. the time of the system power on. 
+ """ + if self._state == _State.EXIT: + raise RuntimeError( + "cannot reschedule after exit from context manager") + if self._state == _State.TIMEOUT: + raise RuntimeError("cannot reschedule expired timeout") + if self._timeout_handler is not None: + self._timeout_handler.cancel() + self._deadline = deadline + if self._state != _State.INIT: + self._reschedule() + + def _reschedule(self) -> None: + assert self._state == _State.ENTER + deadline = self._deadline + if deadline is None: + return + + now = self._loop.time() + if self._timeout_handler is not None: + self._timeout_handler.cancel() + + task = asyncio.current_task() + if deadline <= now: + self._timeout_handler = self._loop.call_soon( + self._on_timeout, task) + else: + self._timeout_handler = self._loop.call_at( + deadline, self._on_timeout, task) + + def _do_enter(self) -> None: + if self._state != _State.INIT: + raise RuntimeError(f"invalid state {self._state.value}") + self._state = _State.ENTER + self._reschedule() + + def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: + if exc_type is asyncio.CancelledError and \ + self._state == _State.TIMEOUT: + self._timeout_handler = None + raise asyncio.TimeoutError + # timeout has not expired + self._state = _State.EXIT + self._reject() + return None + + def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None: + if task: + task.cancel() + self._state = _State.TIMEOUT + # drop the reference early + self._timeout_handler = None From e83db9e7e3d776cd9b059a49024f3950ef579b41 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 19 Jun 2024 18:01:45 -0400 Subject: [PATCH 57/98] [Doc] Update docker references (#5614) Signed-off-by: Rafael Vasquez --- docs/source/dev/dockerfile/dockerfile.rst | 20 +++++++++---------- docs/source/serving/deploying_with_docker.rst | 7 +++---- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/dev/dockerfile/dockerfile.rst index a07463392dbe8..40ba818eb57d8 100644 --- a/docs/source/dev/dockerfile/dockerfile.rst +++ b/docs/source/dev/dockerfile/dockerfile.rst @@ -2,19 +2,19 @@ Dockerfile ==================== See `here `_ for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. +the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `_. -- Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: +Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: - - All build stages - - The default build target (highlighted in grey) - - External images (with dashed borders) +- All build stages +- The default build target (highlighted in grey) +- External images (with dashed borders) - The edges of the build graph represent: - - - FROM ... dependencies (with a solid line and a full arrow head) - - COPY --from=... dependencies (with a dashed line and an empty arrow head) - - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) +The edges of the build graph represent: + +- FROM ... dependencies (with a solid line and a full arrow head) +- COPY --from=... dependencies (with a dashed line and an empty arrow head) +- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) .. 
figure:: ../../assets/dev/dockerfile-stages-dependency.png :alt: query diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index fa82bc8e3bd33..14d94b09e9b9c 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -3,9 +3,8 @@ Deploying with Docker ============================ -vLLM offers official docker image for deployment. -The image can be used to run OpenAI compatible server. -The image is available on Docker Hub as `vllm/vllm-openai `_. +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. .. code-block:: console @@ -25,7 +24,7 @@ The image is available on Docker Hub as `vllm/vllm-openai `_. To build vLLM: .. code-block:: console From 4a30d7e3ccae6e977d728e2157aaa11ac0fed549 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 19 Jun 2024 18:06:44 -0400 Subject: [PATCH 58/98] [Misc] Add per channel support for static activation quantization; update w8a8 schemes to share base classes (#5650) --- tests/quantization/test_compressed_tensors.py | 14 ++- .../compressed_tensors/compressed_tensors.py | 10 ++- .../schemes/compressed_tensors_w8a8.py | 84 +++++++++++++++++ .../compressed_tensors_w8a8_dynamictoken.py | 89 +++---------------- .../compressed_tensors_w8a8_statictensor.py | 60 +++---------- 5 files changed, 121 insertions(+), 136 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index b78081155e2ba..aaa366335d196 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -13,8 +13,12 @@ CompressedTensorsW8A8StaticTensor) -def test_compressed_tensors_w8a8_static_setup(vllm_runner): - model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" +@pytest.mark.parametrize("model_args", [ + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor"), + ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel"), +]) +def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): + model_path, strategy = model_args with vllm_runner(model_path, enforce_eager=True) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -33,12 +37,14 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) + assert qkv_proj.scheme.strategy == strategy assert qkv_proj.weight.dtype is torch.int8 assert o_proj.weight.dtype is torch.int8 assert gate_up_proj.weight.dtype is torch.int8 - assert qkv_proj.weight_scale.shard_splitter is not None - assert qkv_proj.weight_scale.logical_widths is not None + if qkv_proj.scheme.strategy == "tensor": + assert qkv_proj.weight_scale.shard_splitter is not None + assert qkv_proj.weight_scale.logical_widths is not None assert qkv_proj.input_scale.dtype is torch.float32 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 347a052a663da..44dd024afe74d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -85,8 +85,11 @@ def get_config_filenames(cls) -> List[str]: def _is_static_tensor_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 - is_tensor = (weight_quant.strategy == input_quant.strategy == - QuantizationStrategy.TENSOR.value) + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.TENSOR.value + or weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_tensor = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TENSOR.value) is_symmetric = weight_quant.symmetric and input_quant.symmetric is_static = not weight_quant.dynamic and not input_quant.dynamic @@ -131,7 +134,8 @@ def _get_schema(self, weight_quant: BaseModel, if self.quant_format == CompressionFormat.int_quantized.value: if self._is_static_tensor_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8StaticTensor() + return CompressedTensorsW8A8StaticTensor( + strategy=weight_quant.strategy) if self._is_dynamic_token_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8DynamicToken( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py new file mode 100644 index 0000000000000..efed79ec7a11c --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py @@ -0,0 +1,84 @@ +from typing import Callable, List, Tuple, Union + +import torch +from torch.nn import Parameter + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) +from vllm.model_executor.utils import set_weight_attrs + + +class CompressedTensorsW8A8(CompressedTensorsScheme): + + def __init__(self, strategy: str): + self.strategy = strategy + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + assert isinstance(shard_id, str) + qkv_idxs = {"q": 0, "k": 1, "v": 2} + assert shard_id in qkv_idxs + return qkv_idxs[shard_id] + + def scales_shard_splitter( + self, param: torch.Tensor, loaded_weight: torch.Tensor, + shard_id: Union[str, int], + logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shard_id = self._shard_id_as_int(shard_id) + offset = sum(logical_widths[:shard_id]) + size = logical_widths[shard_id] + # update loaded weight with copies for broadcast. 
+ loaded_weight = loaded_weight.repeat(size) + return param[offset:offset + size], loaded_weight + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + is_tensor_partitioned = len(output_partition_sizes) != 1 + weight_scale_dim = sum(output_partition_sizes) if ( + is_tensor_partitioned + or self.strategy == QuantizationStrategy.CHANNEL) else 1 + + shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, ) + if self.strategy == QuantizationStrategy.CHANNEL: + shape = (weight_scale_dim, 1) + + weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32), + requires_grad=False) + + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) + + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + requires_grad=False) + + layer.register_parameter("weight", weight) + set_weight_attrs( + weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + "logical_widths": output_partition_sizes + }) + + # Don't need a shard_splitter for channel-wise quantization + # Use the default loading method + if self.strategy == QuantizationStrategy.CHANNEL: + set_weight_attrs(weight_scale, { + "output_dim": 0, + }) + else: + set_weight_attrs( + weight_scale, { + "logical_widths": output_partition_sizes, + "shard_splitter": self.scales_shard_splitter, + }) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py index 37610c9c2898b..5fc05b8e682d6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py @@ -1,42 +1,15 @@ -from typing import Callable, List, Tuple, Union +from typing import Callable, List import torch -from torch.nn import Parameter from vllm import _custom_ops as custom_ops -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - QuantizationStrategy) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import ( # noqa: E501 + CompressedTensorsW8A8) __all__ = ["CompressedTensorsW8A8DynamicToken"] -class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): - - def __init__(self, strategy: str): - self.strategy = strategy - - def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if isinstance(shard_id, int): - return shard_id - - assert isinstance(shard_id, str) - qkv_idxs = {"q": 0, "k": 1, "v": 2} - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - - def scales_shard_splitter( - self, param: torch.Tensor, loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - shard_id = self._shard_id_as_int(shard_id) - offset = sum(logical_widths[:shard_id]) - size = logical_widths[shard_id] - # update loaded weight with copies for broadcast. 
- loaded_weight = loaded_weight.repeat(size) - return param[offset:offset + size], loaded_weight +class CompressedTensorsW8A8DynamicToken(CompressedTensorsW8A8): def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -44,54 +17,12 @@ def create_weights(self, layer: torch.nn.Module, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - # When the scales have a single value, it is required that they be - # on the CPU for performance and CUDA Graphs compatibility. Please - # refer to the comment in - # CompressedTensorsW8A8StaticTensor::create_weights for further - # information. - is_tensor_partitioned = len(output_partition_sizes) != 1 - # when doing channel-wise quantization, number of scales - # is equal to output_dim - weight_scale_dim = sum(output_partition_sizes) if ( - is_tensor_partitioned - or self.strategy == QuantizationStrategy.CHANNEL) else 1 - - shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, ) - if self.strategy == QuantizationStrategy.CHANNEL: - shape = (weight_scale_dim, 1) - - weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32), - requires_grad=False) - - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.int8), - requires_grad=False) - - layer.register_parameter("weight", weight) - set_weight_attrs( - weight, { - "input_dim": 1, - "output_dim": 0, - "weight_loader": weight_loader, - "logical_widths": output_partition_sizes - }) - - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) - - # Don't need a shard_splitter for channel-wise quantization - # Use the default loading method - if self.strategy == QuantizationStrategy.CHANNEL: - set_weight_attrs(weight_scale, { - "output_dim": 0, - }) - else: - set_weight_attrs( - weight_scale, { - "logical_widths": output_partition_sizes, - "shard_splitter": self.scales_shard_splitter, - }) + super().create_weights( + layer=layer, + output_partition_sizes=output_partition_sizes, + input_size_per_partition=input_size_per_partition, + params_dtype=params_dtype, + weight_loader=weight_loader) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index 414e17a061fb4..79f5358a365ed 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -1,37 +1,17 @@ -from typing import Callable, List, Tuple, Union +from typing import Callable, List import torch from torch.nn import Parameter from vllm import _custom_ops as custom_ops -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import ( # noqa: E501 + CompressedTensorsW8A8) from vllm.model_executor.utils import set_weight_attrs __all__ = ["CompressedTensorsW8A8StaticTensor"] -class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): - - def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if isinstance(shard_id, int): - return shard_id - - assert isinstance(shard_id, str) - qkv_idxs = {"q": 0, "k": 1, 
"v": 2} - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - - def scales_shard_splitter( - self, param: torch.Tensor, loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - shard_id = self._shard_id_as_int(shard_id) - offset = sum(logical_widths[:shard_id]) - size = logical_widths[shard_id] - # update loaded weight with copies for broadcast. - loaded_weight = loaded_weight.repeat(size) - return param[offset:offset + size], loaded_weight +class CompressedTensorsW8A8StaticTensor(CompressedTensorsW8A8): def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -39,41 +19,21 @@ def create_weights(self, layer: torch.nn.Module, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - is_tensor_partitioned = len(output_partition_sizes) != 1 - weight_scale_dim = sum( - output_partition_sizes) if is_tensor_partitioned else 1 + super().create_weights( + layer=layer, + output_partition_sizes=output_partition_sizes, + input_size_per_partition=input_size_per_partition, + params_dtype=params_dtype, + weight_loader=weight_loader) input_scale = Parameter(torch.empty(1, dtype=torch.float32), requires_grad=False) - weight_scale = Parameter(torch.empty(weight_scale_dim, - dtype=torch.float32), - requires_grad=False) - - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.int8), - requires_grad=False) - - layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "weight_loader": weight_loader, - "input_dim": 1, - "output_dim": 0, - }) layer.register_parameter("input_scale", input_scale) set_weight_attrs(input_scale, { "weight_loader": weight_loader, "ignore_warning": True, }) - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs( - weight_scale, { - "weight_loader": weight_loader, - "shard_splitter": self.scales_shard_splitter, - "logical_widths": output_partition_sizes, - "ignore_warning": True, - }) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight From 949e49a6857080e36ecd62f6e193754290c7c43c Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Wed, 19 Jun 2024 16:30:03 -0700 Subject: [PATCH 59/98] [ci] Limit num gpus if specified for A100 (#5694) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 1 + .buildkite/test-template-aws.j2 | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b1602dd9496ba..95cd5b1989ee2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -184,6 +184,7 @@ steps: - label: Distributed Tests (A100) gpu: a100 + num_gpus: 4 commands: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 08146bf4454cc..fb34b787e0cbd 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -75,7 +75,7 @@ steps: - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" resources: limits: - nvidia.com/gpu: 8 + nvidia.com/gpu: {{ step.num_gpus or 1 }} volumeMounts: - name: devshm mountPath: /dev/shm From 3730a1c832bca5ca8128aec3c7659304895edf2e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 20 Jun 2024 10:09:21 +0800 Subject: [PATCH 60/98] [Misc] Improve conftest (#5681) --- tests/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index f37c9883f7cf1..5bbfd87f0bb3b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -365,7 +365,7 @@ def __exit__(self, exc_type, exc_value, traceback): cleanup() -@pytest.fixture +@pytest.fixture(scope="session") def hf_runner(): return HfRunner @@ -385,6 +385,7 @@ def __init__( block_size: int = 16, enable_chunked_prefill: bool = False, swap_space: int = 4, + enforce_eager: bool = False, **kwargs, ) -> None: self.model = LLM( @@ -393,6 +394,7 @@ def __init__( trust_remote_code=True, dtype=dtype, swap_space=swap_space, + enforce_eager=enforce_eager, disable_log_stats=disable_log_stats, tensor_parallel_size=tensor_parallel_size, max_model_len=max_model_len, From 1b2eaac3165dc78d4ef51231722735ca9cf37304 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 19 Jun 2024 23:10:47 -0700 Subject: [PATCH 61/98] [Bugfix][Doc] FIx Duplicate Explicit Target Name Errors (#5703) --- docs/source/dev/dockerfile/dockerfile.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/dev/dockerfile/dockerfile.rst index 40ba818eb57d8..9c17c27aa61bf 100644 --- a/docs/source/dev/dockerfile/dockerfile.rst +++ b/docs/source/dev/dockerfile/dockerfile.rst @@ -1,8 +1,8 @@ Dockerfile ==================== -See `here `_ for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `_. +See `here `__ for the main Dockerfile to construct +the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `__. Below is a visual representation of the multi-stage Dockerfile. 
The build graph contains the following nodes: From 111af1fa2c4fdb2d83b466935a327b1a5009874a Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 20 Jun 2024 12:07:08 +0530 Subject: [PATCH 62/98] [Kernel] Update Cutlass int8 kernel configs for SM90 (#5514) Co-authored-by: Varun Sundar Rabindranath --- .../cutlass_w8a8/scaled_mm_c3x.cu | 165 +++++++++++++++--- 1 file changed, 143 insertions(+), 22 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index f1a2b73ff962b..8f2aa9425a029 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -234,15 +234,15 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, } template typename Epilogue, int32_t M> -struct sm90_fp8_config { + template typename Epilogue> +struct sm90_fp8_config_default { + // M in (128, inf) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = cutlass_3x_gemm; @@ -250,14 +250,14 @@ struct sm90_fp8_config { template typename Epilogue> -struct sm90_fp8_config { +struct sm90_fp8_config_M128 { + // M in (64, 128] static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; using TileShape = Shape<_64, _128, _128>; using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = cutlass_3x_gemm; @@ -265,7 +265,8 @@ struct sm90_fp8_config { template typename Epilogue> -struct sm90_fp8_config { +struct sm90_fp8_config_M64 { + // M in [1, 64] static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; @@ -278,6 +279,78 @@ struct sm90_fp8_config { KernelSchedule, EpilogueSchedule>; }; +template typename Epilogue> +struct sm90_int8_config_default { + // For M > 128 and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M128 { + // For M in (64, 128] and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M64 { + // For M in (32, 64] and any N + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NBig { + // For M in [1, 32] and N >= 8192 + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename 
cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _256>; + using ClusterShape = Shape<_1, _4, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NSmall { + // For M in [1, 32] and N < 8192 + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _8, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + } // namespace template ::Cutlass3xGemm; + typename sm90_fp8_config_default::Cutlass3xGemm; using Cutlass3xGemmM64 = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config_M64::Cutlass3xGemm; using Cutlass3xGemmM128 = - typename sm90_fp8_config::Cutlass3xGemm; + typename sm90_fp8_config_M128::Cutlass3xGemm; uint32_t const m = a.size(0); uint32_t const mp2 = @@ -316,6 +390,61 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, } } +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass3xGemmDefault = + typename sm90_int8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_int8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_int8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM32NBig = + typename sm90_int8_config_M32_NBig::Cutlass3xGemm; + using Cutlass3xGemmM32NSmall = + typename sm90_int8_config_M32_NSmall::Cutlass3xGemm; + + uint32_t const n = out.size(1); + bool const is_small_n = n < 8192; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(32), next_pow_2(m)); // next power of 2 + + if (mp2 <= 32) { + // m in [1, 32] + if (is_small_n) { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } + } else if (mp2 <= 64) { + // m in (32, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -326,22 +455,14 @@ void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a, if (a.dtype() == torch::kInt8) { TORCH_CHECK(b.dtype() == torch::kInt8); - using TileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_1, _2, _1>; - using KernelSchedule = - typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - if (out.dtype() == torch::kBFloat16) { - return cutlass_gemm_caller>(out, a, b, a_scales, b_scales); + return cutlass_gemm_sm90_int8_dispatch( + out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - - return cutlass_gemm_caller< - cutlass_3x_gemm>( + return cutlass_gemm_sm90_int8_dispatch( out, a, b, a_scales, b_scales); } } else { From ad137cd1112ab9b17ac36fc123fc7806a1d7473d Mon Sep 17 00:00:00 2001 From: Roger Wang 
<136131678+ywang96@users.noreply.github.com> Date: Thu, 20 Jun 2024 04:52:09 -0700 Subject: [PATCH 63/98] [Model] Port over CLIPVisionModel for VLMs (#5591) --- csrc/activation_kernels.cu | 12 ++ csrc/ops.h | 2 + csrc/torch_bindings.cpp | 4 + vllm/_custom_ops.py | 4 + vllm/model_executor/layers/activation.py | 16 ++ vllm/model_executor/models/clip.py | 203 +++++++++++++++++++++++ vllm/model_executor/models/llava.py | 17 +- vllm/model_executor/models/llava_next.py | 19 ++- vllm/model_executor/models/phi3v.py | 13 +- 9 files changed, 269 insertions(+), 21 deletions(-) create mode 100644 vllm/model_executor/models/clip.py diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 86ac2e75e78ee..5ed1dc3b8f792 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -135,6 +135,12 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) { return ((T)0.5) * x * (((T)1.0) + t); } +template +__device__ __forceinline__ T gelu_quick_kernel(const T& x) { + // x * sigmoid(1.702 * x) + return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x))); +} + } // namespace vllm void gelu_new(torch::Tensor& out, // [..., d] @@ -148,3 +154,9 @@ void gelu_fast(torch::Tensor& out, // [..., d] { LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); } + +void gelu_quick(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] +{ + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel); +} diff --git a/csrc/ops.h b/csrc/ops.h index 9e2e977fa3c2e..ba92cc5373d7a 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -49,6 +49,8 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input); void gelu_fast(torch::Tensor& out, torch::Tensor& input); +void gelu_quick(torch::Tensor& out, torch::Tensor& input); + #ifndef USE_ROCM torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 867bf438937cd..953f2eb4d8e7d 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -68,6 +68,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_fast", torch::kCUDA, &gelu_fast); + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCUDA, &gelu_quick); + // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. 
ops.def( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index ab2a67950bfea..a053a3aa237e7 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -66,6 +66,10 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: torch.ops._C.gelu_new(out, x) +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + torch.ops._C.gelu_quick(out, x) + + # page attention ops def paged_attention_v1( out: torch.Tensor, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index eb0606948686d..80cad15b43426 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -141,6 +141,21 @@ def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: return out +class QuickGELU(CustomOp): + + # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90 + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + return x * torch.sigmoid(1.702 * x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + from vllm import _custom_ops as ops + + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out + + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. @@ -189,6 +204,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): "gelu_new": NewGELU(), "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), "relu": nn.ReLU(), + "quick_gelu": QuickGELU(), } diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py new file mode 100644 index 0000000000000..aa4e87228a7e4 --- /dev/null +++ b/vllm/model_executor/models/clip.py @@ -0,0 +1,203 @@ +"""Minimal implementation of CLIPVisionModel intended to be only used +within a vision language model.""" +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from transformers import CLIPVisionConfig +from transformers.models.clip.modeling_clip import CLIPAttention + +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +def get_clip_num_patches(image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return (image_size // patch_size)**2 + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa +class CLIPVisionEmbeddings(nn.Module): + + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = get_clip_num_patches(self.image_size, + self.patch_size) + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, + self.embed_dim) + self.register_buffer("position_ids", + torch.arange(self.num_positions).expand((1, -1)), + persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = 
self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + + return embeddings + + +class CLIPMLP(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class CLIPEncoderLayer(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = CLIPAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = CLIPMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]: + + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`CLIPEncoderLayer`]. + + Args: + config: CLIPConfig + """ + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + CLIPEncoderLayer(config=config, quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, + inputs_embeds: torch.Tensor, + vision_feature_layer: int = -1): + + # Encoder forward pass only up to the required layer + num_layer = len(self.layers) + vision_feature_layer + 1 + hidden_states = inputs_embeds + for encoder_layer in self.layers[:num_layer]: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class CLIPVisionTransformer(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPVisionEmbeddings(config) + + # NOTE: This typo of "layrnorm" is not fixed on purpose to match + # the original transformers code and name of the model weights. 
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = CLIPEncoder(config=config, quant_config=quant_config) + + def forward( + self, + pixel_values: torch.Tensor, + vision_feature_layer: int = -1, + ) -> torch.Tensor: + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.encoder(inputs_embeds=hidden_states, + vision_feature_layer=vision_feature_layer) + + return hidden_states + + +class CLIPVisionModel(nn.Module): + + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.vision_model = CLIPVisionTransformer(config=config, + quant_config=quant_config) + + def forward(self, + pixel_values: Optional[torch.Tensor] = None, + vision_feature_layer: int = -1): + + return self.vision_model(pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer) + + @property + def device(self): + return next(self.parameters()).device diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 39355b9d3ab44..8e36c54b1c511 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -2,9 +2,7 @@ import torch import torch.nn as nn -# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on -# transformers' impl. -from transformers import CLIPVisionModel, LlavaConfig +from transformers import LlavaConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -15,6 +13,7 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -189,12 +188,11 @@ def _select_image_features(self, image_features: torch.Tensor, *, def _image_pixels_to_features(self, vision_tower: CLIPVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: - # TODO(xwjiang): Maybe port minimal CLIPVisionModel over. 
- image_outputs = vision_tower(pixel_values.to(vision_tower.device), - output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.config.vision_feature_layer] + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_tower(pixel_values.to(vision_tower.device), + self.config.vision_feature_layer) return self._select_image_features( image_features, @@ -317,6 +315,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 0ab9afea9ac69..c1158c933c88b 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -4,9 +4,7 @@ import torch import torch.nn as nn from PIL import Image -# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on -# transformers' impl. -from transformers import CLIPVisionModel, LlavaNextConfig +from transformers import LlavaNextConfig from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) from typing_extensions import NotRequired @@ -20,6 +18,7 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData @@ -121,7 +120,7 @@ def __init__(self, if self.vision_language_config.image_input_type == ( VisionLanguageConfig.ImageInputType.PIXEL_VALUES): - self.vision_tower = CLIPVisionModel(config.vision_config) + self.vision_tower = CLIPVisionModel(config=config.vision_config) else: raise TypeError("Image features are not supported by LLaVA-NeXT") @@ -219,12 +218,11 @@ def _select_image_features(self, image_features: torch.Tensor, *, def _image_pixels_to_features(self, vision_tower: CLIPVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: - # TODO(xwjiang): Maybe port minimal CLIPVisionModel over. 
- image_outputs = vision_tower(pixel_values.to(vision_tower.device), - output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.config.vision_feature_layer] + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_tower(pixel_values.to(vision_tower.device), + self.config.vision_feature_layer) return self._select_image_features( image_features, @@ -430,6 +428,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 35f3b894f099a..fa20a7c5903d6 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -17,7 +17,7 @@ import torch import torch.nn as nn -from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig +from transformers import CLIPVisionConfig, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -27,6 +27,7 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -70,9 +71,10 @@ def get_img_features(self, LAYER_IDX = self.layer_idx TYPE_FEATURE = self.type_feature - img_processor_output = self.img_processor(img_embeds, - output_hidden_states=True) - img_feature = img_processor_output.hidden_states[LAYER_IDX] + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the img_processor + img_feature = self.img_processor(img_embeds, + vision_feature_layer=LAYER_IDX) if TYPE_FEATURE == "patch": patch_feature = img_feature[:, 1:] @@ -352,6 +354,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # post_layernorm is not needed in CLIPVisionModel + if "vision_model.post_layernorm" in name: + continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) From a7dcc62086ea751b46b4821c2811cf8ac83711bf Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 20 Jun 2024 19:03:21 +0530 Subject: [PATCH 64/98] [Kernel] Update Cutlass int8 kernel configs for SM80 (#5275) Co-authored-by: Varun Sundar Rabindranath --- csrc/quantization/cutlass_w8a8/common.hpp | 7 + .../cutlass_w8a8/scaled_mm_c2x.cu | 127 ++++++++++++++++-- .../cutlass_w8a8/scaled_mm_c3x.cu | 5 - 3 files changed, 123 insertions(+), 16 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp index 999b7b251ab33..23d0587bbdc5d 100644 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ b/csrc/quantization/cutlass_w8a8/common.hpp @@ -1,6 +1,7 @@ #pragma once #include "cutlass/cutlass.h" 
+#include /** * Helper function for checking CUTLASS errors @@ -10,3 +11,9 @@ TORCH_CHECK(status == cutlass::Status::kSuccess, \ cutlassGetStatusString(status)) \ } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 7651268dc5316..740b9fb64a754 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -250,8 +250,120 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, CUTLASS_CHECK(status); } +template typename Epilogue> +struct sm80_config_default { + // This config is used in 2 cases, + // - M in (128, inf) + // - M in (64, 128] and N >= 8192 + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M64 { + // This config is used in 2 cases, + // - M in (32, 64] + // - M in (64, 128] and N < 8192 + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M32 { + // M in (16, 32] + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M16 { + // M in [1, 16] + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + } // namespace +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... 
args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass2xGemmDefault = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128BigN = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128SmallN = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM64 = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM32 = + typename sm80_config_M32::Cutlass2xGemm; + using Cutlass2xGemmM16 = + typename sm80_config_M16::Cutlass2xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(16), next_pow_2(m)); // next power of 2 + if (mp2 <= 16) { + // M in [1, 16] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 32) { + // M in (16, 32] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 64) { + // M in (32, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // M in (64, 128] + uint32_t const n = out.size(1); + bool const small_n = n < 8192; + if (small_n) { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } + } else { + // M in (128, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -288,20 +400,13 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - if (out.dtype() == torch::kBFloat16) { - return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + return cutlass_gemm_sm80_dispatch(out, a, b, a_scales, + b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return cutlass_gemm_caller>( + return cutlass_gemm_sm80_dispatch( out, a, b, a_scales, b_scales); } } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 8f2aa9425a029..cfa8f80f7ea04 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -44,11 +44,6 @@ using namespace cute; namespace { -uint32_t next_pow_2(uint32_t const num) { - if (num <= 1) return num; - return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); -} - // A wrapper for the GEMM kernel that is used to guard against compilation on // architectures that will never use the kernel. The purpose of this is to // reduce the size of the compiled binary. 
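[Editorial sketch, not part of the patch series.] The SM80 dispatch just above picks a CUTLASS tile configuration by rounding M up to the next power of two and bucketing it; N only matters in the (64, 128] bucket (the SM90 int8 dispatch earlier in the series follows the same pattern with different buckets). The Python sketch below only mirrors those bucket boundaries for readability; the real code returns CUTLASS template instantiations, and the struct names here are just labels taken from the diff.

```python
def next_pow_2(num: int) -> int:
    """Smallest power of two >= num (mirrors the helper added to common.hpp)."""
    if num <= 1:
        return num
    return 1 << (num - 1).bit_length()


def pick_sm80_int8_config(m: int, n: int) -> str:
    """Name of the sm80_config_* struct the dispatch above would choose."""
    mp2 = max(16, next_pow_2(m))
    if mp2 <= 16:
        return "sm80_config_M16"        # M in [1, 16]
    if mp2 <= 32:
        return "sm80_config_M32"        # M in (16, 32]
    if mp2 <= 64:
        return "sm80_config_M64"        # M in (32, 64]
    if mp2 <= 128:
        # M in (64, 128]: wide N keeps the default tile, narrow N reuses the M64 tile
        return "sm80_config_default" if n >= 8192 else "sm80_config_M64"
    return "sm80_config_default"        # M in (128, inf)


assert pick_sm80_int8_config(24, 4096) == "sm80_config_M32"
assert pick_sm80_int8_config(100, 16384) == "sm80_config_default"
```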
From 3f3b6b21500bce2061cae33706bd47c8b6663771 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 20 Jun 2024 14:36:10 -0400 Subject: [PATCH 65/98] [Bugfix] Fix the CUDA version check for FP8 support in the CUTLASS kernels (#5715) --- csrc/ops.h | 2 ++ .../quantization/cutlass_w8a8/scaled_mm_entry.cu | 16 ++++++++++++++++ csrc/torch_bindings.cpp | 6 ++++++ vllm/_custom_ops.py | 4 ++++ vllm/model_executor/layers/quantization/fp8.py | 15 ++------------- 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index ba92cc5373d7a..6f0a7143c9169 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -92,6 +92,8 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits); +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); + void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 687f8efd8dc00..f4e582d780ad9 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -25,6 +25,22 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b_scales); #endif +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { + // CUTLASS FP8 kernels need at least + // CUDA 12.0 on SM90 systems (Hopper) + // CUDA 12.4 on SM89 systems (Lovelace) + +#if defined CUDA_VERSION + if (cuda_device_capability >= 90) { + return CUDA_VERSION >= 12000; + } else if (cuda_device_capability >= 89) { + return CUDA_VERSION >= 12040; + } +#endif + + return false; +} + void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 953f2eb4d8e7d..227b69d79e863 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -144,6 +144,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor b, Tensor a_scales," " Tensor b_scales) -> ()"); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); + + // Check if cutlass scaled_mm is supported for CUDA devices of the given + // capability + ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8); + ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA, + &cutlass_scaled_mm_supports_fp8); #endif // Quantized GEMM for GPTQ. 
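[Editorial sketch, not part of the patch series.] The new `cutlass_scaled_mm_supports_fp8` binding moves the FP8 support decision into the compiled extension, so it reflects the CUDA version the kernels were actually built against; the `fp8.py` hunk further down removes a Python check that used a strict `>` comparison and therefore rejected exactly CUDA 12.0 and 12.4. The decision table the C++ function implements can be summarized as follows (capability is major * 10 + minor, and the version uses the `CUDA_VERSION` macro encoding, e.g. 12040 for CUDA 12.4):

```python
def cutlass_fp8_supported_sketch(device_capability: int,
                                 cuda_version: int) -> bool:
    """Pure-Python restatement of the C++ check, for illustration only."""
    if device_capability >= 90:    # Hopper (SM90) needs CUDA 12.0+
        return cuda_version >= 12000
    if device_capability >= 89:    # Ada/Lovelace (SM89) needs CUDA 12.4+
        return cuda_version >= 12040
    return False                   # older architectures: no CUTLASS FP8 path


assert cutlass_fp8_supported_sketch(90, 12010) is True    # SM90 on CUDA 12.1
assert cutlass_fp8_supported_sketch(89, 12020) is False   # SM89 on CUDA 12.2
```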
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index a053a3aa237e7..e050c1172acb5 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -216,6 +216,10 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # cutlass +def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool: + return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability) + + def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, out_dtype: Type[torch.dtype]) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e89fd65813c05..bbf3cde54782d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -20,19 +20,8 @@ def cutlass_fp8_supported() -> bool: capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - major, minor = torch.version.cuda.split(".") - version = int(major) * 10 + int(minor) - - # CUTLASS FP8 kernels need at least - # CUDA 12.0 on SM90 systems (Hopper) - # CUDA 12.4 on SM89 systems (Lovelace) - gpu_is_supported = False - if capability >= 90: - gpu_is_supported = version > 120 - elif capability >= 89: - gpu_is_supported = version > 124 - - return gpu_is_supported + + return ops.cutlass_scaled_mm_supports_fp8(capability) class Fp8Config(QuantizationConfig): From 8065a7e220cca1dd53107da85b6f3932ac9e25e8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 20 Jun 2024 19:00:13 -0400 Subject: [PATCH 66/98] [Frontend] Add FlexibleArgumentParser to support both underscore and dash in names (#5718) --- benchmarks/benchmark_latency.py | 3 ++- benchmarks/benchmark_prefix_caching.py | 4 ++-- benchmarks/benchmark_serving.py | 7 ++++++- benchmarks/benchmark_throughput.py | 3 ++- .../cutlass_benchmarks/w8a8_benchmarks.py | 3 ++- benchmarks/kernels/benchmark_aqlm.py | 4 ++-- benchmarks/kernels/benchmark_marlin.py | 4 ++-- benchmarks/kernels/benchmark_moe.py | 3 ++- .../kernels/benchmark_paged_attention.py | 6 +++--- benchmarks/kernels/benchmark_rope.py | 4 ++-- benchmarks/overheads/benchmark_hashing.py | 4 ++-- examples/aqlm_example.py | 5 ++--- examples/llm_engine_example.py | 3 ++- examples/save_sharded_state.py | 4 ++-- examples/tensorize_vllm_model.py | 3 ++- tests/async_engine/api_server_async_engine.py | 4 ++-- vllm/engine/arg_utils.py | 17 ++++++++--------- vllm/entrypoints/api_server.py | 5 ++--- vllm/entrypoints/openai/cli_args.py | 3 ++- vllm/entrypoints/openai/run_batch.py | 5 ++--- .../model_executor/model_loader/tensorizer.py | 4 ++-- vllm/utils.py | 19 +++++++++++++++++++ 22 files changed, 72 insertions(+), 45 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e9d1048c89b64..a4cf0632b7790 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -13,6 +13,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): @@ -120,7 +121,7 @@ def run_to_completion(profile_dir: Optional[str] = None): if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') parser.add_argument('--model', type=str, default='facebook/opt-125m') diff --git 
a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 089966986984f..395107a5ec747 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,7 +1,7 @@ -import argparse import time from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. 
Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 @@ -44,7 +44,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance with or without automatic ' 'prefix caching.') parser.add_argument('--model', diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index eef03e7d81c39..42867fc40edd2 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -44,6 +44,11 @@ except ImportError: from backend_request_func import get_tokenizer +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + @dataclass class BenchmarkMetrics: @@ -511,7 +516,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the online serving throughput.") parser.add_argument( "--backend", diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ed65002bc7d3c..2c6beb4e89672 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -12,6 +12,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -261,7 +262,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 5cc0fbbd49b8e..377f8683c021f 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -11,6 +11,7 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -293,7 +294,7 @@ def to_torch_dtype(dt): return torch.float8_e4m3fn raise ValueError("unsupported dtype") - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description=""" Benchmark Cutlass GEMM. 
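[Editorial sketch, not part of the patch series.] The benchmark scripts in this commit only swap the parser class; the behavior change comes from `FlexibleArgumentParser` (its implementation appears later in the series, in `vllm/utils.py`), which rewrites underscores to dashes in `--`-prefixed tokens before normal argparse handling. A minimal demonstration, assuming a vLLM build that includes this patch; the flag name below is made up for the example:

```python
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser()
parser.add_argument("--max-model-len", type=int, default=None)

# Both spellings now parse identically, because "--max_model_len" is
# rewritten to "--max-model-len" before argparse sees it.
args_dash = parser.parse_args(["--max-model-len", "4096"])
args_underscore = parser.parse_args(["--max_model_len", "4096"])
assert args_dash.max_model_len == args_underscore.max_model_len == 4096
```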
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index ac6a9f297f95a..601c4ea439aea 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,3 @@ -import argparse import os import sys from typing import Optional @@ -10,6 +9,7 @@ from vllm.model_executor.layers.quantization.aqlm import ( dequantize_weight, generic_dequantize_gemm, get_int_dtype, optimized_dequantize_gemm) +from vllm.utils import FlexibleArgumentParser os.environ['CUDA_VISIBLE_DEVICES'] = '0' @@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: def main(): - parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") # Add arguments parser.add_argument("--nbooks", diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 96f01967b351e..261f5829631ee 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,3 @@ -import argparse from typing import List import torch @@ -16,6 +15,7 @@ MarlinWorkspace, marlin_24_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -211,7 +211,7 @@ def main(args): # python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 # if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark Marlin across specified models/shapes/batches") parser.add_argument( "--models", diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 62347aaf8ed6d..e00696d6d43cb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -10,6 +10,7 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.utils import FlexibleArgumentParser class BenchmarkConfig(TypedDict): @@ -315,7 +316,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 687e2369b758c..16de60477c305 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,3 @@ -import argparse import random import time from typing import List, Optional @@ -6,7 +5,8 @@ import torch from vllm import _custom_ops as ops -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 @@ -161,7 +161,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", type=str, diff --git 
a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index a53c6c77a5828..78736c7a7ba6f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,4 +1,3 @@ -import argparse from itertools import accumulate from typing import List, Optional @@ -7,6 +6,7 @@ from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -86,7 +86,7 @@ def benchmark_rope_kernels_multi_lora( if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the rotary embedding kernels.") parser.add_argument("--is-neox-style", type=bool, default=True) parser.add_argument("--batch-size", type=int, default=16) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index c846e47de1fcf..203699e9a8d06 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,8 +1,8 @@ -import argparse import cProfile import pstats from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?" @@ -47,7 +47,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance of hashing function in' 'automatic prefix caching.') parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 3a63003ab4ba9..40f9a21ec9e51 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,11 +1,10 @@ -import argparse - from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser def main(): - parser = argparse.ArgumentParser(description='AQLM examples') + parser = FlexibleArgumentParser(description='AQLM examples') parser.add_argument('--model', '-m', diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index a81c4b3e399c3..ca41f32b12b31 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -2,6 +2,7 @@ from typing import List, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.utils import FlexibleArgumentParser def create_test_prompts() -> List[Tuple[str, SamplingParams]]: @@ -55,7 +56,7 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Demo on using the LLMEngine class directly') parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/examples/save_sharded_state.py b/examples/save_sharded_state.py index c595d98ba2750..4207f8922403b 100644 --- a/examples/save_sharded_state.py +++ b/examples/save_sharded_state.py @@ -20,15 +20,15 @@ tensor_parallel_size=8, ) """ -import argparse import dataclasses import os import shutil from pathlib import Path from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser -parser = argparse.ArgumentParser() +parser = FlexibleArgumentParser() EngineArgs.add_cli_args(parser) parser.add_argument("--output", "-o", diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index f9ed5fe08988e..dd77a4ad0c6b7 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ 
-9,6 +9,7 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, TensorizerConfig, tensorize_vllm_model) +from vllm.utils import FlexibleArgumentParser # yapf conflicts with isort for this docstring # yapf: disable @@ -96,7 +97,7 @@ def parse_args(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="An example script that can be used to serialize and " "deserialize vLLM models. These models " "can be loaded using tensorizer directly to the GPU " diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 1be76fdc8d868..495a123c351d7 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,5 +1,4 @@ """vllm.entrypoints.api_server with some extra logging for testing.""" -import argparse from typing import Any, Dict import uvicorn @@ -8,6 +7,7 @@ import vllm.entrypoints.api_server from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.utils import FlexibleArgumentParser app = vllm.entrypoints.api_server.app @@ -33,7 +33,7 @@ def stats() -> Response: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser = AsyncEngineArgs.add_cli_args(parser) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7f760c2775f3d..ef31612420c94 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -11,7 +11,7 @@ SpeculativeConfig, TokenizerPoolConfig, VisionLanguageConfig) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import str_to_int_tuple +from vllm.utils import FlexibleArgumentParser, str_to_int_tuple def nullable_str(val: str): @@ -110,7 +110,7 @@ def __post_init__(self): @staticmethod def add_cli_args_for_vlm( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--image-input-type', type=nullable_str, default=None, @@ -156,8 +156,7 @@ def add_cli_args_for_vlm( return parser @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Shared CLI arguments for vLLM engine.""" # Model arguments @@ -800,8 +799,8 @@ class AsyncEngineArgs(EngineArgs): max_log_len: Optional[int] = None @staticmethod - def add_cli_args(parser: argparse.ArgumentParser, - async_args_only: bool = False) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser, + async_args_only: bool = False) -> FlexibleArgumentParser: if not async_args_only: parser = EngineArgs.add_cli_args(parser) parser.add_argument('--engine-use-ray', @@ -822,13 +821,13 @@ def add_cli_args(parser: argparse.ArgumentParser, # These functions are used by sphinx to build the documentation def _engine_args_parser(): - return EngineArgs.add_cli_args(argparse.ArgumentParser()) + return EngineArgs.add_cli_args(FlexibleArgumentParser()) def _async_engine_args_parser(): - return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(), + return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(), async_args_only=True) def _vlm_engine_args_parser(): - return EngineArgs.add_cli_args_for_vlm(argparse.ArgumentParser()) + return EngineArgs.add_cli_args_for_vlm(FlexibleArgumentParser()) 
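[Editorial sketch, not part of the patch series.] The entrypoint hunks that follow (api_server.py, cli_args.py, run_batch.py, tensorizer.py) all adopt the same parser class, so every engine and server flag accepts either spelling. The sketch below shows the intended end-to-end usage; it relies on `EngineArgs.from_cli_args`, an existing helper that this series does not touch, so treat it as an assumption rather than code from the patch:

```python
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="toy engine CLI")
parser = EngineArgs.add_cli_args(parser)

# The underscore spelling is rewritten to "--gpu-memory-utilization"
# before argparse handles it, so both forms resolve to the same flag.
ns = parser.parse_args(
    ["--model", "facebook/opt-125m", "--gpu_memory_utilization", "0.8"])
engine_args = EngineArgs.from_cli_args(ns)
print(engine_args.model, engine_args.gpu_memory_utilization)  # opt-125m 0.8
```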
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 075de0b4efb2d..feb904c5a13c9 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -6,7 +6,6 @@ change `vllm/entrypoints/openai/api_server.py` instead. """ -import argparse import json import ssl from typing import AsyncGenerator @@ -19,7 +18,7 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid TIMEOUT_KEEP_ALIVE = 5 # seconds. app = FastAPI() @@ -80,7 +79,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8000) parser.add_argument("--ssl-keyfile", type=str, default=None) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 4c0cb1e4f3e49..59ad73bf097c8 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -10,6 +10,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.utils import FlexibleArgumentParser class LoRAParserAction(argparse.Action): @@ -23,7 +24,7 @@ def __call__(self, parser, namespace, values, option_string=None): def make_arg_parser(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") parser.add_argument("--host", type=nullable_str, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 2f18701870a6f..488ac89710b6d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,4 +1,3 @@ -import argparse import asyncio import sys from io import StringIO @@ -16,14 +15,14 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) def parse_args(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible batch runner.") parser.add_argument( "-i", diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index d79fedaea428e..b009ad8c882d4 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -21,6 +21,7 @@ QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.utils import FlexibleArgumentParser tensorizer_error_msg = None @@ -177,8 +178,7 @@ def __post_init__(self): self.deserializer_params['encryption'] = decryption_params @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Tensorizer CLI arguments""" # Tensorizer options arg group diff --git a/vllm/utils.py b/vllm/utils.py index ffe921e650aaa..27a7b1042d88f 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,3 +1,4 @@ +import argparse import asyncio import datetime import 
enum @@ -775,3 +776,21 @@ def wrapper(*args, **kwargs) -> Any: wrapper.has_run = False # type: ignore[attr-defined] return wrapper + + +class FlexibleArgumentParser(argparse.ArgumentParser): + """ArgumentParser that allows both underscore and dash in names.""" + + def parse_args(self, args=None, namespace=None): + if args is None: + args = sys.argv[1:] + + # Convert underscores to dashes and vice versa in argument names + processed_args = [] + for arg in args: + if arg.startswith('--'): + processed_args.append('--' + arg[len('--'):].replace('_', '-')) + else: + processed_args.append(arg) + + return super().parse_args(processed_args, namespace) From 6c5b7af1525a2013d7b1806dd6c0c9a53404be6d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 20 Jun 2024 17:06:34 -0700 Subject: [PATCH 67/98] [distributed][misc] use fork by default for mp (#5669) --- .buildkite/test-pipeline.yaml | 9 ++++++ .../custom_all_reduce_utils.py | 28 ++++++++++++++++++- vllm/envs.py | 4 +-- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 95cd5b1989ee2..5e92ba3c24f55 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -37,6 +37,9 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py @@ -55,6 +58,9 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 4 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s distributed/test_pynccl.py # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. @@ -145,6 +151,9 @@ steps: num_gpus: 4 # This test runs llama 13B, so it is required to run on 4 GPUs. 
commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s -x lora/test_long_context.py - label: Tensorizer Test diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index e0641a54c4194..d3e41fa710676 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,6 +1,9 @@ import ctypes import json import os +import pickle +import subprocess +import sys from itertools import product from typing import Dict, List, Optional, Sequence @@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: ids = list(range(num_dev)) # batch of all pairs of GPUs batch_src, batch_tgt = zip(*list(product(ids, ids))) - result = can_actually_p2p(batch_src, batch_tgt) + # NOTE: we use `subprocess` rather than `multiprocessing` here + # because the caller might not have `if __name__ == "__main__":`, + # in that case we cannot use spawn method in multiprocessing. + # However, `can_actually_p2p` requires spawn method. + # The fix is, we use `subprocess` to call the function, + # where we have `if __name__ == "__main__":` in this file. + input_bytes = pickle.dumps((batch_src, batch_tgt)) + returned = subprocess.run([sys.executable, __file__], + input=input_bytes, + capture_output=True) + # check if the subprocess is successful + try: + returned.check_returncode() + except Exception as e: + # wrap raised exception to provide more information + raise RuntimeError( + f"Error happened when batch testing " + f"peer-to-peer access from {batch_src} to {batch_tgt}") from e + result = pickle.loads(returned.stdout) for _i, _j, r in zip(batch_src, batch_tgt, result): cache[f"{_i}->{_j}"] = r with open(path, "w") as f: @@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: __all__ = ["gpu_p2p_access_check"] + +if __name__ == "__main__": + batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read()) + result = can_actually_p2p(batch_src, batch_tgt) + sys.stdout.buffer.write(pickle.dumps(result)) diff --git a/vllm/envs.py b/vllm/envs.py index f03b69f4b8866..ae2fcd0826fb1 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,7 +29,7 @@ VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/" VLLM_USE_RAY_COMPILED_DAG: bool = False - VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" + VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None @@ -212,7 +212,7 @@ # Use dedicated multiprocess context for workers. 
# Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": - lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"), + lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"), # Timeout for fetching images when serving multimodal models # Default is 5 seconds From b12518d3cf4326dfcd10a09780913b86c19fcf1a Mon Sep 17 00:00:00 2001 From: Joshua Rosenkranz Date: Thu, 20 Jun 2024 20:23:12 -0400 Subject: [PATCH 68/98] [Model] MLPSpeculator speculative decoding support (#4947) Signed-off-by: Thomas Parnell Co-authored-by: Thomas Parnell Co-authored-by: Nick Hill Co-authored-by: Davis Wertheimer --- examples/offline_inference_mlpspeculator.py | 59 ++++++++ tests/spec_decode/test_spec_decode_worker.py | 8 +- tests/spec_decode/test_utils.py | 4 +- vllm/config.py | 54 +++++-- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/mlp_speculator.py | 143 ++++++++++++++++++ vllm/sequence.py | 46 ++++++ vllm/spec_decode/batch_expansion.py | 6 +- vllm/spec_decode/interfaces.py | 4 + vllm/spec_decode/mlp_speculator_worker.py | 87 +++++++++++ vllm/spec_decode/spec_decode_worker.py | 42 ++++- vllm/spec_decode/top1_proposer.py | 4 + vllm/spec_decode/util.py | 8 - vllm/transformers_utils/config.py | 18 ++- vllm/transformers_utils/configs/__init__.py | 2 + .../configs/mlp_speculator.py | 50 ++++++ vllm/worker/model_runner.py | 18 ++- vllm/worker/worker.py | 9 ++ 18 files changed, 523 insertions(+), 40 deletions(-) create mode 100644 examples/offline_inference_mlpspeculator.py create mode 100644 vllm/model_executor/models/mlp_speculator.py create mode 100644 vllm/spec_decode/mlp_speculator_worker.py create mode 100644 vllm/transformers_utils/configs/mlp_speculator.py diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference_mlpspeculator.py new file mode 100644 index 0000000000000..5448ec1f6208c --- /dev/null +++ b/examples/offline_inference_mlpspeculator.py @@ -0,0 +1,59 @@ +import gc +import time +from typing import List + +from vllm import LLM, SamplingParams + + +def time_generation(llm: LLM, prompts: List[str], + sampling_params: SamplingParams): + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + # Warmup first + llm.generate(prompts, sampling_params) + llm.generate(prompts, sampling_params) + start = time.time() + outputs = llm.generate(prompts, sampling_params) + end = time.time() + print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs])) + # Print the outputs. + for output in outputs: + generated_text = output.outputs[0].text + print(f"text: {generated_text!r}") + + +if __name__ == "__main__": + + template = ( + "Below is an instruction that describes a task. Write a response " + "that appropriately completes the request.\n\n### Instruction:\n{}" + "\n\n### Response:\n") + + # Sample prompts. + prompts = [ + "Write about the president of the United States.", + ] + prompts = [template.format(prompt) for prompt in prompts] + # Create a sampling params object. 
+ sampling_params = SamplingParams(temperature=0.0, max_tokens=200) + + # Create an LLM without spec decoding + llm = LLM(model="meta-llama/Llama-2-13b-chat-hf") + + print("Without speculation") + time_generation(llm, prompts, sampling_params) + + del llm + gc.collect() + + # Create an LLM with spec decoding + llm = LLM( + model="meta-llama/Llama-2-13b-chat-hf", + speculative_model="ibm-fms/llama-13b-accelerator", + # These are currently required for MLPSpeculator decoding + use_v2_block_manager=True, + enforce_eager=True, + ) + + print("With speculation") + time_generation(llm, prompts, sampling_params) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index afaeffc9681cb..a20c793c9bfd7 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -456,7 +456,9 @@ def test_k_equals_zero(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + sampler_output = MagicMock(spec=SamplerOutput) + sampler_output.hidden_states = None + target_worker.execute_model.return_value = [sampler_output] draft_worker.device = 'cuda' target_worker.device = 'cuda' @@ -497,7 +499,9 @@ def test_empty_input_batch(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + sampler_output = MagicMock(spec=SamplerOutput) + sampler_output.hidden_states = None + target_worker.execute_model.return_value = [sampler_output] draft_worker.device = 'cuda' target_worker.device = 'cuda' diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 6b6f35a1a1d05..bccbf9a6aaaeb 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -2,8 +2,8 @@ import pytest -from vllm.sequence import SequenceGroupMetadata -from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len +from vllm.sequence import SequenceGroupMetadata, get_all_seq_ids +from vllm.spec_decode.util import split_batch_by_proposal_len def test_get_all_seq_ids(): diff --git a/vllm/config.py b/vllm/config.py index 5de00d7d38d46..8d004902fe4ff 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -230,7 +230,8 @@ def verify_with_parallel_config( self, parallel_config: "ParallelConfig", ) -> None: - total_num_attention_heads = self.hf_text_config.num_attention_heads + total_num_attention_heads = getattr(self.hf_text_config, + "num_attention_heads", 0) tensor_parallel_size = parallel_config.tensor_parallel_size if total_num_attention_heads % tensor_parallel_size != 0: raise ValueError( @@ -238,7 +239,8 @@ def verify_with_parallel_config( " must be divisible by tensor parallel size " f"({tensor_parallel_size}).") - total_num_hidden_layers = self.hf_text_config.num_hidden_layers + total_num_hidden_layers = getattr(self.hf_text_config, + "num_hidden_layers", 0) pipeline_parallel_size = parallel_config.pipeline_parallel_size if total_num_hidden_layers % pipeline_parallel_size != 0: raise ValueError( @@ -341,8 +343,8 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: def get_num_attention_heads(self, parallel_config: "ParallelConfig") -> int: - return self.hf_text_config.num_attention_heads // \ - parallel_config.tensor_parallel_size + num_heads = getattr(self.hf_text_config, 
"num_attention_heads", 0) + return num_heads // parallel_config.tensor_parallel_size def get_num_layers(self, parallel_config: "ParallelConfig") -> int: total_num_hidden_layers = self.hf_text_config.num_hidden_layers @@ -818,7 +820,8 @@ def maybe_create_spec_config( speculative_model (Optional[str]): The name of the speculative model, if provided. num_speculative_tokens (Optional[int]): The number of speculative - tokens, if provided. + tokens, if provided. Will default to the number in the draft + model config if present, otherwise is required. speculative_max_model_len (Optional[int]): The maximum model len of the speculative model. Used when testing the ability to skip speculation for some sequences. @@ -841,24 +844,18 @@ def maybe_create_spec_config( the necessary conditions are met, else None. """ - if speculative_model is None and num_speculative_tokens is None: + if speculative_model is None: + if num_speculative_tokens is not None: + raise ValueError("num_speculative_tokens was provided without " + "speculative_model.") return None - if speculative_model is not None and num_speculative_tokens is None: - raise ValueError( - "Expected both speculative_model and " - "num_speculative_tokens to be provided, but found " - f"{speculative_model=} and {num_speculative_tokens=}.") - if (speculative_disable_by_batch_size is not None and speculative_disable_by_batch_size < 2): raise ValueError("Expect the batch size threshold of disabling " "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - assert (speculative_model is not None - and num_speculative_tokens is not None) - if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " @@ -912,6 +909,27 @@ def maybe_create_spec_config( max_logprobs=target_model_config.max_logprobs, ) + if (draft_model_config.hf_config.model_type == "mlp_speculator" + and target_parallel_config.world_size != 1): + # MLPSpeculator TP support will be added very soon + raise ValueError( + "Speculative decoding with mlp_speculator models does not " + "yet support distributed inferencing (TP > 1).") + + n_predict = getattr(draft_model_config.hf_config, "n_predict", + None) + if n_predict is not None: + if num_speculative_tokens is None: + # Default to max value defined in draft model config. + num_speculative_tokens = n_predict + elif num_speculative_tokens > n_predict: + # Verify provided value doesn't exceed the maximum + # supported by the draft model. 
+ raise ValueError( + "Expected both speculative_model and " + "num_speculative_tokens to be provided, but found " + f"{speculative_model=} and {num_speculative_tokens=}.") + draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, @@ -923,6 +941,12 @@ def maybe_create_spec_config( SpeculativeConfig.create_draft_parallel_config( target_parallel_config)) + if num_speculative_tokens is None: + raise ValueError( + "num_speculative_tokens must be provided with " + "speculative_model unless the draft model config contains an " + "n_predict parameter.") + return SpeculativeConfig( draft_model_config, draft_parallel_config, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index f9ec7209689e7..5afb2e1d44d39 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -60,6 +60,7 @@ "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), "XverseForCausalLM": ("xverse", "XverseForCausalLM"), "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), + "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } _EMBEDDING_MODELS = { diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py new file mode 100644 index 0000000000000..b18269777cd01 --- /dev/null +++ b/vllm/model_executor/models/mlp_speculator.py @@ -0,0 +1,143 @@ +import math +from typing import Iterable, List, Tuple + +import torch +import torch.nn as nn + +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.sequence import SamplerOutput + + +class MLPSpeculatorLayerNorm(nn.Module): + """ + A L2 normalization implementation + ... + Args + ---- + normalized_shape : int + Dimensionality of input data (size of final tensor axis) + eps : float + Safety term to prevent division by zero. Make sure the chosen value + fits in the range of your encoding scheme + (i.e. fp16 requires eps >= 6e-8). 
+ """ + + def __init__( + self, + normalized_shape, + eps=1e-06, + ): + super(MLPSpeculatorLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.empty(normalized_shape)) + self.bias = nn.Parameter(torch.empty(normalized_shape)) + self.eps = eps + + def forward(self, x): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps) + x = xf.type_as(x) + x = self.weight * x + x = x + self.bias + return x + + +class MLPSpeculator(nn.Module): + + def __init__(self, config, **kwargs) -> None: + super().__init__() + self.n_predict = config.n_predict + self.vocab_size = config.vocab_size + self.emb_dim = config.emb_dim + self.inner_dim = config.inner_dim if config.inner_dim != 0 \ + else config.emb_dim + + self.max_speculative_tokens = getattr(config, "max_speculative_tokens", + self.n_predict) + + self.emb = nn.ModuleList([ + VocabParallelEmbedding(config.vocab_size, + self.inner_dim, + org_num_embeddings=config.vocab_size) + for _ in range(self.max_speculative_tokens) + ]) + + self.proj = nn.ModuleList([ + nn.Linear((self.emb_dim if i == 0 else self.inner_dim), + self.inner_dim, + bias=False) for i in range(self.max_speculative_tokens) + ]) + + self.head = nn.ModuleList([ + nn.Linear(self.inner_dim, self.vocab_size, bias=False) + for _ in range(self.max_speculative_tokens) + ]) + self.ln = nn.ModuleList([ + MLPSpeculatorLayerNorm(self.inner_dim) + for _ in range(self.max_speculative_tokens) + ]) + + self.state_weight = 0.5**(0.5 / config.n_predict) + self.emb_weight = math.sqrt( + (1 - self.state_weight**2) * (self.inner_dim / 2)) + self.activation = nn.GELU() + self.config = config + self.logits_processor = LogitsProcessor(config.vocab_size, + config.vocab_size, 1.0) + self.sampler = Sampler() + + def generate_proposals( + self, + input_ids: torch.Tensor, + previous_hidden_states: torch.Tensor, + num_predict_tokens: int, + sampling_metadata: SamplingMetadata, + ) -> List[SamplerOutput]: + if num_predict_tokens > self.max_speculative_tokens: + raise ValueError(f"Max speculative tokens for model is " + f"{self.max_speculative_tokens}, but " + f"{num_predict_tokens} were requested") + + # b x 1 x d + previous_hidden_states = previous_hidden_states.unsqueeze(1) + + # b x 1 + last_tokens = input_ids.unsqueeze(1) + + next_tokens = [] + + for head_index in range(num_predict_tokens): + + # Project and predict + z = self.emb[head_index](last_tokens) # b k d + states = self.proj[head_index](previous_hidden_states) + + # Weighted add of state_weight*state and emb_weight*z + # Let subsequent LN take care of denominator + # state_weight is close to 1, so shouldn't be any precision issues + states.add_(z, alpha=self.emb_weight / self.state_weight) + + states = self.activation(self.ln[head_index](states)) # b k d + # TODO: not yet supporting top_k_tokens_per_head + previous_hidden_states = states + + logits = self.logits_processor(self.head[head_index].weight, + states, sampling_metadata) + + output = self.sampler(logits.flatten(0, 1), sampling_metadata) + last_tokens = output.sampled_token_ids + next_tokens.append(output) + + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + param = params_dict[name.replace("speculator.", "")] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/sequence.py b/vllm/sequence.py index 38d3349f2ab41..287e1b9df6165 100644 --- a/vllm/sequence.py +++ 
b/vllm/sequence.py @@ -794,6 +794,9 @@ class SamplerOutput: # Spec decode metrics populated by workers. spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + # Optional last hidden states from the model. + hidden_states: Optional[torch.Tensor] = None + def __getitem__(self, idx: int): return self.outputs[idx] @@ -842,6 +845,46 @@ def __eq__(self, other: object): self.__class__) and self.outputs == other.outputs +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] + + +class HiddenStates: + """Hidden states corresponding to in-progress sequences. + Used in speculative decoding to pass hidden states from + the target model to the proposer model in the subsequent step. + + seq_ids are the sequence ids of each entry of the batch + dimension of the hidden_states tensor""" + + def __init__(self, seq_group_metadata_list: List[SequenceGroupMetadata], + hidden_states: torch.Tensor): + assert len(seq_group_metadata_list) == len(hidden_states) + self.seq_ids: List[int] = get_all_seq_ids(seq_group_metadata_list) + self.hidden_states: torch.Tensor = hidden_states + + def update(self, seq_group_metadata_list: List[SequenceGroupMetadata], + hidden_states: torch.Tensor) -> None: + """Update hidden states from target model invocation.""" + assert len(seq_group_metadata_list) == len(hidden_states) + self.seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) + self.hidden_states = torch.cat([self.hidden_states, hidden_states]) + + def prune(self, + seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + """Prune to provided list of sequence ids.""" + seq_ids = get_all_seq_ids(seq_group_metadata_list) + if seq_ids != self.seq_ids: + # Batch contents changed - prune removed sequences. + index = [self.seq_ids.index(seq_id) for seq_id in seq_ids] + self.hidden_states = self.hidden_states[index] + self.seq_ids = seq_ids + + @dataclass class ExecuteModelRequest: """The model execution request.""" @@ -857,6 +900,8 @@ class ExecuteModelRequest: num_lookahead_slots: int = 0 # The number of requests in the running queue. running_queue_size: int = 0 + # Optional hidden states from prior step. 
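
Editor's note: the new HiddenStates container above keeps one hidden-state row per live sequence and re-orders those rows whenever the batch composition changes. The prune() re-indexing is easy to check in isolation; a toy sketch with made-up sequence ids (plain torch, no vLLM objects):

import torch

seq_ids = [0, 1, 2, 3]                       # row order of the stored hidden states
hidden_states = torch.arange(4, dtype=torch.float32).unsqueeze(1)
new_seq_ids = [3, 1]                         # batch contents after scheduling
index = [seq_ids.index(seq_id) for seq_id in new_seq_ids]
hidden_states = hidden_states[index]         # rows re-ordered to match new_seq_ids
seq_ids = new_seq_ids
print(hidden_states.squeeze(1).tolist())     # [3.0, 1.0]
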
+ previous_hidden_states: Optional[HiddenStates] = None def clone( self, seq_group_metadata_list: List[SequenceGroupMetadata] @@ -869,4 +914,5 @@ def clone( blocks_to_copy=self.blocks_to_copy.copy(), num_lookahead_slots=self.num_lookahead_slots, running_queue_size=self.running_queue_size, + previous_hidden_states=self.previous_hidden_states, ) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 1bde042086f0b..40516556344e9 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -4,11 +4,10 @@ import torch from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, - SequenceGroupMetadata) + SequenceGroupMetadata, get_all_seq_ids) from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, - sampler_output_to_torch, +from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, split_batch_by_proposal_len) from vllm.worker.worker_base import WorkerBase @@ -98,6 +97,7 @@ def score_proposals( probs=all_probs, token_ids=all_tokens, logprobs=spec_logprobs, + hidden_states=target_sampler_output.hidden_states, ) def _expand_batch( diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index 72d7818eb1177..d236fc0f2cb6b 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +from typing import Optional import torch @@ -46,6 +47,9 @@ class SpeculativeScores: # tokens and also non-speculative normal decoding. token_ids: torch.Tensor + # Optional last hidden states from the scoring model. + hidden_states: Optional[torch.Tensor] = None + def __repr__(self): return (f"SpeculativeScores(" f"probs={self.probs.shape}, " diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py new file mode 100644 index 0000000000000..0926e13bedab1 --- /dev/null +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -0,0 +1,87 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.model_executor import SamplingMetadata +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, + SequenceGroupMetadata) +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase +from vllm.worker.model_runner import ModelInput + + +class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): + """Worker for MLPSpeculator models. + + Not currently compatible with LoRA or chunked prefill. + """ + + @torch.inference_mode() + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + """Run the model forward pass to generate sample_len future tokens. + Returns the list of sampler output, one per layer, along with indicator + of whether torch tensor in sampler output need to be transposed in + latter sampler_output_to_torch logic. + + For mlp spec worker, this indicator shall be True. 
+ """ + self._raise_if_unsupported(execute_model_req) + + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + + (input_tokens, seq_lens, + query_lens) = self._prepare_input_tensors(seq_group_metadata_list) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, seq_lens, query_lens, self.device, + self.model_runner.pin_memory) + + model_outputs = self.model_runner.model.generate_proposals( + input_ids=input_tokens, + previous_hidden_states=execute_model_req.previous_hidden_states. + hidden_states, + num_predict_tokens=sample_len, + sampling_metadata=sampling_metadata) + + assert len(model_outputs) == sample_len + + return model_outputs, True + + def _prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, List[int], List[int]]: + if not seq_group_metadata_list: + return ModelInput.empty(self.device) + + input_tokens: List[int] = [] + seq_lens: List[int] = [] + query_lens: List[int] = [] + + for seq_group_metadata in seq_group_metadata_list: + is_prompt = seq_group_metadata.is_prompt + + for seq_data in seq_group_metadata.seq_data.values(): + seq_data_len = seq_data.get_len() + if is_prompt: + context_len = seq_data.get_num_computed_tokens() + seq_len = min( + seq_data_len, + context_len + seq_group_metadata.token_chunk_size) + tokens = seq_data.get_token_ids()[context_len:seq_len] + seq_lens.append(seq_len) + input_tokens.extend(tokens) + query_lens.append(seq_len - context_len) + else: + seq_lens.append(seq_data_len) + input_tokens.append(seq_data.get_last_token_id()) + query_lens.append(1) + + input_tokens_tensor = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + return input_tokens_tensor, seq_lens, query_lens diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 03fad5663037b..58d3461a25188 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -8,16 +8,18 @@ from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, - SamplerOutput, SequenceGroupMetadata) + HiddenStates, SamplerOutput, SequenceGroupMetadata, + get_all_seq_ids) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.util import (create_sequence_group_output, - get_all_num_logprobs, get_all_seq_ids, + get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, split_batch_by_proposal_len) from vllm.worker.worker import Worker @@ -104,6 +106,10 @@ def create_worker( proposer_worker = NGramWorker(**draft_worker_kwargs) proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) + elif draft_worker_kwargs[ + "model_config"].hf_config.model_type == "mlp_speculator": + proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) + disable_bonus_tokens = False else: proposer_worker = MultiStepWorker(**draft_worker_kwargs) @@ -155,6 +161,10 @@ def __init__( # Lazy initiazliation. 
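
Editor's note: for decode sequences, _prepare_input_tensors in the new MLPSpeculatorWorker above feeds the speculator only the last sampled token of each sequence with a query length of 1, while prompt chunks contribute their not-yet-computed token range. A toy illustration of the decode path with made-up token histories (no real SequenceGroupMetadata objects):

import torch

token_ids_per_seq = [[5, 9, 2], [7, 1]]      # full token histories of two sequences
input_tokens = [seq[-1] for seq in token_ids_per_seq]
seq_lens = [len(seq) for seq in token_ids_per_seq]
query_lens = [1] * len(token_ids_per_seq)
input_tokens_tensor = torch.tensor(input_tokens, dtype=torch.long)
print(input_tokens_tensor.tolist(), seq_lens, query_lens)   # [2, 1] [3, 2] [1, 1]
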
self.scorer: SpeculativeScorer + # Hidden states from target model to pass to proposer + # in the subsequent step. + self.previous_hidden_states: Optional[HiddenStates] = None + def init_device(self) -> None: """Initialize both scorer and proposer models. """ @@ -337,6 +347,16 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, assert len(sampler_output) == 1 sampler_output = sampler_output[0] + # Store hidden states from target model execution. + hidden_states = sampler_output.hidden_states + if hidden_states is not None: + if self.previous_hidden_states is None: + self.previous_hidden_states = HiddenStates( + execute_model_req.seq_group_metadata_list, hidden_states) + else: + self.previous_hidden_states.update( + execute_model_req.seq_group_metadata_list, hidden_states) + # Clear device tensors from sampler output. This reduces communication # overhead when the engine runs in a different process than the workers. sampler_output.probs = None @@ -383,6 +403,10 @@ def _run_speculative_decoding_step( """ assert num_lookahead_slots == execute_model_req.num_lookahead_slots + # Pass last hidden states from target model to proposer + execute_model_req.previous_hidden_states = self.previous_hidden_states + self.previous_hidden_states = None + # Generate proposals using draft worker. proposals = self.proposer_worker.get_spec_proposals(execute_model_req) @@ -466,6 +490,20 @@ def _verify_tokens( # metadata. accepted_token_ids[original_indices] = accepted_token_ids.clone() + hidden_states = proposal_scores.hidden_states + if hidden_states is not None: + # Contract hidden states based on accepted tokens + hs_size = hidden_states.shape[1] + hidden_states = hidden_states.reshape(-1, max_proposal_len + 1, + hs_size) + accepted_index = accepted_token_ids + 1 # Convert -1 to 0 + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) + index = accepted_index[:, None, None].expand(-1, 1, hs_size) + hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d + # Store hidden states from target model for subsequent decode step + self.previous_hidden_states = HiddenStates(seq_group_metadata_list, + hidden_states) + return accepted_token_ids, logprobs def _create_output_sampler_list( diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 278db94bfc0da..d3e280e6843b8 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -65,9 +65,13 @@ def get_spec_proposals( # token_ids is like [batch] format in proposal_len size list, # while if it is false, the format would be [proposal_len] # in batch size list + hidden_states = execute_model_req.previous_hidden_states + if hidden_states is not None: + hidden_states.prune(nonzero_proposal_len_seqs) nonzero_execute_model_req = ExecuteModelRequest( seq_group_metadata_list=nonzero_proposal_len_seqs, num_lookahead_slots=proposal_len, + previous_hidden_states=hidden_states, ) maybe_sampler_output, transposed = self._worker.sampler_output( execute_model_req=nonzero_execute_model_req, diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 9bbe3f8d16117..80710419e602d 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -10,14 +10,6 @@ SeqId = int -def get_all_seq_ids( - seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]: - """Given a list of SequenceGroupMetadata, create a list of all - sequence ids. 
- """ - return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] - - def get_all_num_logprobs( seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: """Given a list of SequenceGroupMetadata, create a list of all num_logprobs. diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index ada84018212a0..60fc756a12e3d 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,3 +1,4 @@ +import contextlib from typing import Dict, Optional, Type from transformers import PretrainedConfig @@ -5,7 +6,13 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - JAISConfig, MPTConfig, RWConfig) + JAISConfig, MLPSpeculatorConfig, + MPTConfig, RWConfig) + +if VLLM_USE_MODELSCOPE: + from modelscope import AutoConfig +else: + from transformers import AutoConfig logger = init_logger(__name__) @@ -16,8 +23,13 @@ "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, + "mlp_speculator": MLPSpeculatorConfig, } +for name, cls in _CONFIG_REGISTRY.items(): + with contextlib.suppress(ValueError): + AutoConfig.register(name, cls) + def get_config(model: str, trust_remote_code: bool, @@ -26,10 +38,6 @@ def get_config(model: str, rope_scaling: Optional[dict] = None, rope_theta: Optional[float] = None) -> PretrainedConfig: try: - if VLLM_USE_MODELSCOPE: - from modelscope import AutoConfig - else: - from transformers import AutoConfig config = AutoConfig.from_pretrained( model, trust_remote_code=trust_remote_code, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 0e486928824ca..d8170858c2a9a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -5,6 +5,7 @@ # `FalconConfig` class from the official HuggingFace transformers library. from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.jais import JAISConfig +from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig __all__ = [ @@ -13,4 +14,5 @@ "MPTConfig", "RWConfig", "JAISConfig", + "MLPSpeculatorConfig", ] diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py new file mode 100644 index 0000000000000..dd1d92b861b81 --- /dev/null +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -0,0 +1,50 @@ +from typing import List, Optional + +from transformers import PretrainedConfig + + +class MLPSpeculatorConfig(PretrainedConfig): + model_type = "mlp_speculator" + + attribute_map = { + "hidden_size": "emb_dim", + } + + def __init__(self, + vocab_size: int = 32000, + emb_dim: int = 4096, + inner_dim: int = 0, + n_predict: int = 3, + top_k_tokens_per_head: Optional[List[int]] = None, + n_candidates: int = 5, + **kwargs): + """ + Initialize an MLPSpeculatorConfig + + Args: + vocab_size: int + the model vocab size + emb_dim: int + the model embedding dimension + inner_dim: int + the inner dimension of the model. If 0, will be the emb_dim. + n_predict: int + the number of lookaheads for the speculator + top_k_tokens_per_head: List[int] + Number of tokens to consider from each head when forming the + candidate tree. + For each candidate branch in the tree, head n produces topk[n] + additional sub-branches. 
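
Editor's note: the config.py change above registers each vLLM-specific config class with AutoConfig at import time, so that AutoConfig.from_pretrained can resolve checkpoints whose model_type is, for example, "mlp_speculator". The same pattern in isolation, with a hypothetical config class:

import contextlib

from transformers import AutoConfig, PretrainedConfig


class ToySpeculatorConfig(PretrainedConfig):
    model_type = "toy_speculator"      # hypothetical model type, not a real one


with contextlib.suppress(ValueError):
    # transformers raises ValueError if the type is already registered,
    # e.g. when this module is imported more than once.
    AutoConfig.register("toy_speculator", ToySpeculatorConfig)
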
+ n_candidates: int + number of child candidates to create per sequence + """ + if top_k_tokens_per_head is None: + top_k_tokens_per_head = [5, 4, 3] + assert len(top_k_tokens_per_head) == n_predict + self.vocab_size = vocab_size + self.emb_dim = emb_dim + self.inner_dim = inner_dim + self.n_predict = n_predict + self.top_k_tokens_per_head = top_k_tokens_per_head + self.n_candidates = n_candidates + super().__init__(**kwargs) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index d0baa4337f84a..e24835a1ea7fb 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -86,6 +86,7 @@ def __init__( kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, vision_language_config: Optional[VisionLanguageConfig] = None, + return_hidden_states: bool = False, ): self.model_config = model_config self.parallel_config = parallel_config @@ -96,6 +97,7 @@ def __init__( self.load_config = load_config self.is_driver_worker = is_driver_worker self.vision_language_config = vision_language_config + self.return_hidden_states = return_hidden_states self.device = self.device_config.device self.pin_memory = is_pin_memory_available() @@ -116,15 +118,17 @@ def __init__( self.graph_block_tables = np.zeros( (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), dtype=np.int32) + num_attn_heads = self.model_config.get_num_attention_heads( + self.parallel_config) self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), + num_attn_heads, self.model_config.get_head_size(), self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, - ) + ) if num_attn_heads else None # Create processor for multi-modal data if self.vision_language_config is not None: @@ -762,11 +766,19 @@ def execute_model( return None # Sample the next token. - output = self.model.sample( + output: SamplerOutput = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, ) + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + assert seq_group_metadata_list is not None + if seq_group_metadata_list[0].is_prompt: + hidden_states = hidden_states.index_select( + 0, sampling_metadata.selected_token_indices) + output.hidden_states = hidden_states + return output @torch.inference_mode() diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index f9b8a065a8b24..e334ffbb755bb 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -70,6 +70,14 @@ def __init__( assert not self.lora_config, ( "To be tested: vision language model with LoRA settings.") + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type != + "mlp_speculator") else {"return_hidden_states": True} + ModelRunnerClass = (EmbeddingModelRunner if self.model_config.embedding_mode else ModelRunner) self.model_runner = ModelRunnerClass( @@ -83,6 +91,7 @@ def __init__( kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, vision_language_config=vision_language_config, + **speculative_args, ) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
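
Editor's note: the proposal loop in mlp_speculator.py earlier in this patch reduces to a small per-head recurrence: embed the previously predicted token, project the running hidden state, mix the two with fixed weights, normalize, and read logits off a per-head output matrix. A toy, greedy version with hypothetical sizes (the real code uses vLLM's layer norm, sampler and batching):

import math

import torch

emb_dim, vocab, n_predict = 8, 16, 3
state_weight = 0.5 ** (0.5 / n_predict)
emb_weight = math.sqrt((1 - state_weight ** 2) * (emb_dim / 2))

emb = [torch.nn.Embedding(vocab, emb_dim) for _ in range(n_predict)]
proj = [torch.nn.Linear(emb_dim, emb_dim, bias=False) for _ in range(n_predict)]
head = [torch.nn.Linear(emb_dim, vocab, bias=False) for _ in range(n_predict)]

state = torch.randn(1, 1, emb_dim)       # last hidden state from the target model
last_token = torch.tensor([[3]])         # last token sampled by the target model
proposals = []
for i in range(n_predict):
    z = emb[i](last_token)               # embed the previous prediction
    state = proj[i](state) + (emb_weight / state_weight) * z
    state = torch.nn.functional.gelu(state)   # LayerNorm omitted in this sketch
    logits = head[i](state)
    last_token = logits.argmax(dim=-1)   # greedy pick stands in for the sampler
    proposals.append(last_token.item())
print(proposals)                         # n_predict proposed token ids
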
From 1f5674218f968dec625d0995fe5cd5d626db9188 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Fri, 21 Jun 2024 08:55:41 +0800 Subject: [PATCH 69/98] [Kernel] Add punica dimension for Qwen2 LoRA (#5441) --- csrc/punica/bgmv/bgmv_config.h | 38 ++++++++++++++++++++++++++++++++-- tests/lora/test_punica.py | 17 +++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index c38db2dcd2c4d..cb6694b3036e9 100755 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -16,15 +16,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 512) \ f(in_T, out_T, W_T, narrow, 640) \ f(in_T, out_T, W_T, narrow, 768) \ + f(in_T, out_T, W_T, narrow, 896) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1152) \ + f(in_T, out_T, W_T, narrow, 1216) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1536) \ f(in_T, out_T, W_T, narrow, 1664) \ f(in_T, out_T, W_T, narrow, 1728) \ f(in_T, out_T, W_T, narrow, 1792) \ f(in_T, out_T, W_T, narrow, 2048) \ + f(in_T, out_T, W_T, narrow, 2240) \ f(in_T, out_T, W_T, narrow, 2304) \ + f(in_T, out_T, W_T, narrow, 2368) \ + f(in_T, out_T, W_T, narrow, 2432) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ f(in_T, out_T, W_T, narrow, 2816) \ @@ -32,8 +37,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 3328) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ + f(in_T, out_T, W_T, narrow, 3712) \ f(in_T, out_T, W_T, narrow, 4096) \ + f(in_T, out_T, W_T, narrow, 4480) \ f(in_T, out_T, W_T, narrow, 4608) \ + f(in_T, out_T, W_T, narrow, 4736) \ + f(in_T, out_T, W_T, narrow, 4864) \ f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ @@ -43,8 +52,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 6848) \ f(in_T, out_T, W_T, narrow, 6912) \ f(in_T, out_T, W_T, narrow, 7168) \ + f(in_T, out_T, W_T, narrow, 7424) \ f(in_T, out_T, W_T, narrow, 8192) \ + f(in_T, out_T, W_T, narrow, 8960) \ f(in_T, out_T, W_T, narrow, 9216) \ + f(in_T, out_T, W_T, narrow, 9472) \ f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ f(in_T, out_T, W_T, narrow, 11264) \ @@ -52,8 +64,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ + f(in_T, out_T, W_T, narrow, 14784) \ + f(in_T, out_T, W_T, narrow, 14848) \ f(in_T, out_T, W_T, narrow, 15360) \ f(in_T, out_T, W_T, narrow, 16384) \ + f(in_T, out_T, W_T, narrow, 18944) \ f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, narrow, 22016) \ f(in_T, out_T, W_T, narrow, 22528) \ @@ -61,6 +76,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 27392) \ f(in_T, out_T, W_T, narrow, 27648) \ f(in_T, out_T, W_T, narrow, 28672) \ + f(in_T, out_T, W_T, narrow, 29568) \ + f(in_T, out_T, W_T, narrow, 29696) \ f(in_T, out_T, W_T, narrow, 32000) \ f(in_T, out_T, W_T, narrow, 32256) \ f(in_T, out_T, W_T, narrow, 32512) \ @@ -85,9 +102,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA // and vllm/tests/lora/test_punica.py -// Used for defining kernels going from the variety of +// Used for 
defining kernels going from the variety of // dim in to the narrow dim out - // Using it for the fully sharded column + // Using it for the fully sharded column // parallel LoRA A which splits the rank dim #define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \ f(in_T, out_T, W_T, 128, narrow) \ @@ -95,15 +112,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 512, narrow) \ f(in_T, out_T, W_T, 640, narrow) \ f(in_T, out_T, W_T, 768, narrow) \ + f(in_T, out_T, W_T, 896, narrow) \ f(in_T, out_T, W_T, 1024, narrow) \ f(in_T, out_T, W_T, 1152, narrow) \ + f(in_T, out_T, W_T, 1216, narrow) \ f(in_T, out_T, W_T, 1280, narrow) \ f(in_T, out_T, W_T, 1536, narrow) \ f(in_T, out_T, W_T, 1664, narrow) \ f(in_T, out_T, W_T, 1728, narrow) \ f(in_T, out_T, W_T, 1792, narrow) \ f(in_T, out_T, W_T, 2048, narrow) \ + f(in_T, out_T, W_T, 2240, narrow) \ f(in_T, out_T, W_T, 2304, narrow) \ + f(in_T, out_T, W_T, 2368, narrow) \ + f(in_T, out_T, W_T, 2432, narrow) \ f(in_T, out_T, W_T, 2560, narrow) \ f(in_T, out_T, W_T, 2752, narrow) \ f(in_T, out_T, W_T, 2816, narrow) \ @@ -111,8 +133,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 3328, narrow) \ f(in_T, out_T, W_T, 3456, narrow) \ f(in_T, out_T, W_T, 3584, narrow) \ + f(in_T, out_T, W_T, 3712, narrow) \ f(in_T, out_T, W_T, 4096, narrow) \ + f(in_T, out_T, W_T, 4480, narrow) \ f(in_T, out_T, W_T, 4608, narrow) \ + f(in_T, out_T, W_T, 4736, narrow) \ + f(in_T, out_T, W_T, 4864, narrow) \ f(in_T, out_T, W_T, 5120, narrow) \ f(in_T, out_T, W_T, 5504, narrow) \ f(in_T, out_T, W_T, 5632, narrow) \ @@ -122,8 +148,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 6848, narrow) \ f(in_T, out_T, W_T, 6912, narrow) \ f(in_T, out_T, W_T, 7168, narrow) \ + f(in_T, out_T, W_T, 7424, narrow) \ f(in_T, out_T, W_T, 8192, narrow) \ + f(in_T, out_T, W_T, 8960, narrow) \ f(in_T, out_T, W_T, 9216, narrow) \ + f(in_T, out_T, W_T, 9472, narrow) \ f(in_T, out_T, W_T, 10240, narrow) \ f(in_T, out_T, W_T, 11008, narrow) \ f(in_T, out_T, W_T, 11264, narrow) \ @@ -131,8 +160,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 13696, narrow) \ f(in_T, out_T, W_T, 13824, narrow) \ f(in_T, out_T, W_T, 14336, narrow) \ + f(in_T, out_T, W_T, 14784, narrow) \ + f(in_T, out_T, W_T, 14848, narrow) \ f(in_T, out_T, W_T, 15360, narrow) \ f(in_T, out_T, W_T, 16384, narrow) \ + f(in_T, out_T, W_T, 18944, narrow) \ f(in_T, out_T, W_T, 20480, narrow) \ f(in_T, out_T, W_T, 22016, narrow) \ f(in_T, out_T, W_T, 22528, narrow) \ @@ -140,6 +172,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 27392, narrow) \ f(in_T, out_T, W_T, 27648, narrow) \ f(in_T, out_T, W_T, 28672, narrow) \ + f(in_T, out_T, W_T, 29568, narrow) \ + f(in_T, out_T, W_T, 29696, narrow) \ f(in_T, out_T, W_T, 32000, narrow) \ f(in_T, out_T, W_T, 32256, narrow) \ f(in_T, out_T, W_T, 32512, narrow) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index dae1d56874628..110c9b243507d 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -49,21 +49,30 @@ def _lora_ref_impl( 128, 256, 512, + 896, 1024, 1152, + 1216, 1280, 1536, 1664, 2048, + 2240, 2304, + 2368, + 2432, 2560, 2752, 3072, 3328, 3456, 3584, + 3712, 4096, + 4480, 4608, + 4736, + 4864, 5120, 5504, 5632, @@ -73,19 +82,27 @@ def _lora_ref_impl( 6848, 6912, 7168, + 7424, 8192, + 8960, 9216, + 9472, 10240, 11008, 11264, 
13824, 14336, + 14784, + 14848, 15360, + 18944, 22016, 22528, 24576, 27392, 27648, + 29568, + 29696, 32000, 32256, 32512, From c35e4a3dd74fa5952b04354a3c7cfd0ed09e2eb0 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Thu, 20 Jun 2024 21:45:34 -0700 Subject: [PATCH 70/98] [BugFix] Fix test_phi3v.py (#5725) --- tests/conftest.py | 4 +++- tests/models/test_phi3v.py | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5bbfd87f0bb3b..67885b93285c5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -233,11 +233,13 @@ def generate_greedy( prompts: List[str], max_tokens: int, images: Optional[List[Image.Image]] = None, + **kwargs, ) -> List[Tuple[List[int], str]]: outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, - images=images) + images=images, + **kwargs) return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 1732e8f08a897..23454759827d5 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -77,7 +77,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], # numeric difference for longer context and test can't pass @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("max_tokens", [128]) def test_models(hf_runner, vllm_runner, hf_images, vllm_images, model_and_config, dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. @@ -95,9 +95,11 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, hf_model_kwargs = {"_attn_implementation": "eager"} with hf_runner(model_id, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model: - hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, - max_tokens, - images=hf_images) + hf_outputs = hf_model.generate_greedy( + HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images, + eos_token_id=hf_model.processor.tokenizer.eos_token_id) vllm_image_prompts = [ p.replace("<|image_1|>", From 67005a07bc0991211ba2acccb3e56c72a47f9def Mon Sep 17 00:00:00 2001 From: Jee Li Date: Fri, 21 Jun 2024 12:46:28 +0800 Subject: [PATCH 71/98] [Bugfix] Add fully sharded layer for QKVParallelLinearWithLora (#5665) Co-authored-by: Antoni Baum --- tests/lora/test_baichuan.py | 14 +++++--- tests/lora/test_layers.py | 7 ++-- vllm/lora/fully_sharded_layers.py | 58 +++++++++++++++++++++++++++++-- vllm/lora/layers.py | 36 +++++++++++-------- vllm/lora/utils.py | 4 ++- 5 files changed, 93 insertions(+), 26 deletions(-) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index e1b81655c5610..56cec4db89e64 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -64,7 +64,8 @@ def test_baichuan_lora(baichuan_lora_files): @pytest.mark.skip("Requires multiple GPUs") -def test_baichuan_tensor_parallel_equality(baichuan_lora_files): +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_baichuan_tensor_parallel_equality(baichuan_lora_files, fully_sharded): # Cannot use as it will initialize torch.cuda too early... 
# if torch.cuda.device_count() < 4: # pytest.skip(f"Not enough GPUs for tensor parallelism {4}") @@ -75,7 +76,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=1, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) del llm_tp1 @@ -87,7 +89,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=2, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) del llm_tp2 @@ -101,10 +104,11 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=4, - trust_remote_code=True) + trust_remote_code=True, + fully_sharded_loras=fully_sharded) output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) del llm_tp4 cleanup() - assert output_tp1 == output_tp4 \ No newline at end of file + assert output_tp1 == output_tp4 diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 4b489670f53f9..2e51e95a38f2e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -12,7 +12,8 @@ from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA) + MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + RowParallelLinearWithShardedLoRA) # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, @@ -684,7 +685,9 @@ def create_column_parallel_packed_layer(): bias=False, params_dtype=torch.float16) linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = QKVParallelLinearWithLora(linear) + lora_linear = QKVParallelLinearWithLora( + linear + ) if not fully_shard else QKVParallelLinearWithShardedLora(linear) @dataclass class FakeConfig: diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index ffdc32b7339af..d27171f720832 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -12,6 +12,7 @@ from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, MergedQKVParallelLinearWithLora, + QKVParallelLinearWithLora, RowParallelLinearWithLoRA) from vllm.lora.punica import bgmv, dispatch_bgmv_low_level @@ -90,11 +91,11 @@ def can_replace_layer(cls, source_layer: nn.Module, def _mcp_apply(x, bias, layer): """ MergedColumnParallelLinearWithShardedLoRA and - QKVParallelLinearWithShardedLora share the same + MergedQKVParallelLinearWithShardedLora share the same LoRa weight application method. The main difference is the step by shard_size for lora_b which can - vary for QKVParallelLinearWithShardedLora but is constant for + vary for MergedQKVParallelLinearWithShardedLora but is constant for MergedColumnParallelLinearWithShardedLoRA. """ # expecting 2 for column parallel and 3 for qkv @@ -167,7 +168,7 @@ def can_replace_layer(cls, source_layer: nn.Module, ) -class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): +class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora): """ Differs from QKVParallelLinearWithLora by slicing the LoRA A's also. 
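
Editor's note: the fully sharded variants here, including the new QKVParallelLinearWithShardedLora, slice LoRA A along the rank dimension, so each tensor-parallel rank computes a partial low-rank activation that is all-gathered before LoRA B is applied. A single-device toy check that the sliced computation matches the unsharded one (torch.cat stands in for tensor_model_parallel_all_gather):

import torch

d_in, rank, d_out, tp = 16, 8, 16, 2
x = torch.randn(4, d_in)
lora_a = torch.randn(d_in, rank)
lora_b = torch.randn(rank, d_out)

ref = x @ lora_a @ lora_b                        # unsharded reference

shard = rank // tp
partials = [x @ lora_a[:, i * shard:(i + 1) * shard] for i in range(tp)]
buffer = torch.cat(partials, dim=1)              # "all-gather" of the rank slices
out = buffer @ lora_b
assert torch.allclose(ref, out, atol=1e-5)
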
@@ -175,6 +176,57 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): Based on S-LoRA, slicing happens along the rank dim. """ + def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.lora_a_stacked.shape[2] + start_idx = tp_rank * shard_size + lora_a = lora_a[:, start_idx:start_idx + shard_size] + return lora_a + + def apply(self, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, + output.shape[-1]), output.shape + buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), + dtype=torch.float32, + device=x.device) + + bgmv(buffer, x, self.lora_a_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + buffer = tensor_model_parallel_all_gather(buffer) + bgmv(output, buffer, self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + # now have column partitioned output + + output = output.view(*out_orig_shape) + return output + + @classmethod + @_fully_sharded_can_replace + def can_replace_layer(cls, source_layer: nn.Module, + lora_config: LoRAConfig, packed_modules_list: List, + model_config: Optional[PretrainedConfig]) -> bool: + # specifying kwargs so they can be easily accessed in decorator + return super().can_replace_layer( + source_layer=source_layer, + lora_config=lora_config, + packed_modules_list=packed_modules_list, + model_config=model_config, + decorate=False, + ) + + +class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): + """ + Differs from MergedQKVParallelLinearWithLora by slicing the + LoRA A's also. + + Based on S-LoRA, slicing happens along the rank dim. 
+ """ + def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e3ab1708c3fdf..e4a23273f7282 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -641,6 +641,24 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: + tp_rank = get_tensor_model_parallel_rank() + self.q_shard_id = tp_rank + self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas + lora_b_q = lora_b[:, self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + k_offset = self.q_proj_total_size + lora_b_k = lora_b[:, k_offset + + self.kv_proj_shard_size * self.kv_shard_id:k_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + v_offset = k_offset + self.kv_proj_total_size + lora_b_v = lora_b[:, v_offset + + self.kv_proj_shard_size * self.kv_shard_id:v_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) + return lora_b + def set_lora( self, index: int, @@ -650,21 +668,8 @@ def set_lora( ): self.reset_lora(index) if self.tp_size > 1: - tp_rank = get_tensor_model_parallel_rank() - self.q_shard_id = tp_rank - self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas - lora_b_q = lora_b[:, self.q_proj_shard_size * - self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] - k_offset = self.q_proj_total_size - lora_b_k = lora_b[:, k_offset + self.kv_proj_shard_size * - self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - v_offset = k_offset + self.kv_proj_total_size - lora_b_v = lora_b[:, v_offset + self.kv_proj_shard_size * - self.kv_shard_id:v_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -674,6 +679,7 @@ def set_lora( lora_b.T, non_blocking=True) @classmethod + @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: List, model_config: Optional[PretrainedConfig]) -> bool: diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 4a86c16cf64db..ab3b99eee6fc1 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -8,7 +8,8 @@ from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA) + MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + RowParallelLinearWithShardedLoRA) # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable @@ -35,6 +36,7 @@ RowParallelLinearWithLoRA, LogitsProcessorWithLoRA, ColumnParallelLinearWithShardedLoRA, + QKVParallelLinearWithShardedLora, MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA, From d9a252bc8e8a2741d8a2997032a94208fb8f29d9 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 20 Jun 2024 22:12:35 -0700 Subject: [PATCH 72/98] [Core][Distributed] add shm broadcast (#5399) Co-authored-by: Cody Yu --- .buildkite/test-pipeline.yaml | 4 +- tests/distributed/test_shm_broadcast.py | 82 
++++++ .../device_communicators/shm_broadcast.py | 259 ++++++++++++++++++ vllm/distributed/parallel_state.py | 44 ++- vllm/envs.py | 5 + 5 files changed, 384 insertions(+), 10 deletions(-) create mode 100644 tests/distributed/test_shm_broadcast.py create mode 100644 vllm/distributed/device_communicators/shm_broadcast.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5e92ba3c24f55..c337a81d4a0d2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,9 +28,11 @@ steps: - label: Distributed Comm Ops Test #mirror_hardwares: [amd] - command: pytest -v -s distributed/test_comm_ops.py working_dir: "/vllm-workspace/tests" num_gpus: 2 + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py - label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py new file mode 100644 index 0000000000000..d92900ffce00b --- /dev/null +++ b/tests/distributed/test_shm_broadcast.py @@ -0,0 +1,82 @@ +import multiprocessing +import random +import time + +import torch.distributed as dist + +from vllm.distributed.device_communicators.shm_broadcast import ( + ShmRingBuffer, ShmRingBufferIO) +from vllm.utils import update_environment_variables + + +def distributed_run(fn, world_size): + number_of_processes = world_size + processes = [] + for i in range(number_of_processes): + env = {} + env['RANK'] = str(i) + env['LOCAL_RANK'] = str(i) + env['WORLD_SIZE'] = str(number_of_processes) + env['LOCAL_WORLD_SIZE'] = str(number_of_processes) + env['MASTER_ADDR'] = 'localhost' + env['MASTER_PORT'] = '12345' + p = multiprocessing.Process(target=fn, args=(env, )) + processes.append(p) + p.start() + + for p in processes: + p.join() + + for p in processes: + assert p.exitcode == 0 + + +def worker_fn_wrapper(fn): + # `multiprocessing.Process` cannot accept environment variables directly + # so we need to pass the environment variables as arguments + # and update the environment variables in the function + def wrapped_fn(env): + update_environment_variables(env) + dist.init_process_group(backend="gloo") + fn() + + return wrapped_fn + + +@worker_fn_wrapper +def worker_fn(): + writer_rank = 2 + broadcaster = ShmRingBufferIO.create_from_process_group( + dist.group.WORLD, 1024, 2, writer_rank) + if dist.get_rank() == writer_rank: + time.sleep(random.random()) + broadcaster.broadcast_object(0) + time.sleep(random.random()) + broadcaster.broadcast_object({}) + time.sleep(random.random()) + broadcaster.broadcast_object([]) + else: + time.sleep(random.random()) + a = broadcaster.broadcast_object(None) + time.sleep(random.random()) + b = broadcaster.broadcast_object(None) + time.sleep(random.random()) + c = broadcaster.broadcast_object(None) + assert a == 0 + assert b == {} + assert c == [] + dist.barrier() + + +def test_shm_broadcast(): + distributed_run(worker_fn, 4) + + +def test_singe_process(): + buffer = ShmRingBuffer(1, 1024, 4) + reader = ShmRingBufferIO(buffer, reader_rank=0) + writer = ShmRingBufferIO(buffer, reader_rank=-1) + writer.enqueue([0]) + writer.enqueue([1]) + assert reader.dequeue() == [0] + assert reader.dequeue() == [1] diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py new file mode 100644 index 0000000000000..119befcf64052 --- /dev/null +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -0,0 +1,259 @@ +import pickle +import time 
+from contextlib import contextmanager +from multiprocessing import shared_memory +from typing import Optional +from unittest.mock import patch + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +import vllm.envs as envs +from vllm.logger import init_logger + +VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL + +logger = init_logger(__name__) + + +class ShmRingBuffer: + + def __init__(self, + n_reader: int, + max_chunk_bytes: int, + max_chunks: int, + name: Optional[str] = None): + """ + A shared memory ring buffer implementation for broadcast communication. + Essentially, it is a queue where only one will `enqueue` and multiple + will `dequeue`. The max size of each item, together with the max number + of items that can be stored in the buffer are known in advance. + In this case, we don't need to synchronize the access to + the buffer. + + Buffer memory layout: + data metadata + | | + | (current_idx) | (current_idx) + v v + +-------------------------------+----------------------------------------+ + | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata | + +-------------------------------+----------------------------------------+ + | max_chunks x max_chunk_bytes | max_chunks x (1 + n_reader) bytes | + + metadata memory layout: each byte is a flag, the first byte is the written + flag, and the rest are reader flags. The flags are set to 0 by default. + +--------------+--------------+--------------+-----+--------------+ + | written_flag | reader0_flag | reader1_flag | ... | readerN_flag | + +--------------+--------------+--------------+-----+--------------+ + + During creation, `name` is None and the buffer is created. We can pass the + created object to other processes by pickling it. The other processes will + get the name of the shared memory and open it, so that they can access the + same shared memory buffer. + """# noqa + self.n_reader = n_reader + self.metadata_size = 1 + n_reader + self.max_chunk_bytes = max_chunk_bytes + self.max_chunks = max_chunks + self.total_bytes_of_buffer = (self.max_chunk_bytes + + self.metadata_size) * self.max_chunks + self.data_offset = 0 + self.metadata_offset = self.max_chunk_bytes * self.max_chunks + + if name is None: + # we are creating a buffer + self.is_creator = True + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.total_bytes_of_buffer) + # initialize the metadata section to 0 + with memoryview(self.shared_memory.buf[self.metadata_offset:] + ) as metadata_buffer: + torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0) + else: + # we are opening an existing buffer + self.is_creator = False + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. 
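
Editor's note: the constructor above distinguishes the creating process, which allocates the segment and zeroes the metadata, from attaching processes, which open the same segment by name after the pickled object is sent to them. The underlying stdlib handshake, stripped to its essentials:

from multiprocessing import shared_memory

creator = shared_memory.SharedMemory(create=True, size=16)
creator.buf[0] = 42
attached = shared_memory.SharedMemory(name=creator.name)   # open the same segment by name
assert attached.buf[0] == 42
attached.close()
creator.close()
creator.unlink()                                           # only the creator unlinks
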
+ with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + self.shared_memory = shared_memory.SharedMemory(name=name) + assert self.shared_memory.size == self.total_bytes_of_buffer + with memoryview(self.shared_memory.buf[self.metadata_offset:] + ) as metadata_buffer: + tensor = torch.frombuffer(metadata_buffer, dtype=torch.uint8) + assert torch.all(tensor == 0) + + def __reduce__(self): + return ( + self.__class__, + (self.n_reader, self.max_chunk_bytes, self.max_chunks, + self.shared_memory.name), + ) + + def __del__(self): + self.shared_memory.close() + if self.is_creator: + self.shared_memory.unlink() + + @contextmanager + def get_data(self, current_idx: int): + start = self.data_offset + current_idx * self.max_chunk_bytes + end = start + self.max_chunk_bytes + with memoryview(self.shared_memory.buf[start:end]) as buf: + yield buf + + @contextmanager + def get_metadata(self, current_idx: int): + start = self.metadata_offset + current_idx * self.metadata_size + end = start + self.metadata_size + with memoryview(self.shared_memory.buf[start:end]) as buf: + yield buf + + +class ShmRingBufferIO: + + def __init__(self, buffer: ShmRingBuffer, reader_rank: int): + self.buffer = buffer + self.reader_rank = reader_rank + self._is_writer = self.reader_rank == -1 + self._is_reader = not self._is_writer + if self._is_reader: + assert 0 <= self.reader_rank < buffer.n_reader, \ + (f"Invalid reader rank {self.reader_rank} for buffer" + f" created with {buffer.n_reader} readers") + self.current_idx = 0 + + @contextmanager + def acquire_write(self): + assert self._is_writer, "Only writers can acquire write" + start_index = self.current_idx + start_time = time.time() + n_warning = 1 + while True: + with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + read_count = sum(metadata_buffer[1:]) + written_flag = metadata_buffer[0] + if written_flag and read_count != self.buffer.n_reader: + # this block is written and not read by all readers + # try to write to the next block + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks + if self.current_idx == start_index: + # no empty block found + if time.time( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. 
", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + # wait for a while (0.1 us) + time.sleep(1e-7) + continue + # found a block that is either + # (1) not written + # (2) read by all readers + + # mark the block as not written + metadata_buffer[0] = 0 + # let caller write to the buffer + with self.buffer.get_data(self.current_idx) as buf: + yield buf + + # caller has written to the buffer + # mark the block as written + metadata_buffer[0] = 1 + for i in range(1, self.buffer.n_reader + 1): + # set read flag to 0, meaning it is not read yet + metadata_buffer[i] = 0 + break + + @contextmanager + def acquire_read(self): + assert self._is_reader, "Only readers can acquire read" + start_index = self.current_idx + start_time = time.time() + n_warning = 1 + while True: + with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + read_flag = metadata_buffer[self.reader_rank + 1] + written_flag = metadata_buffer[0] + if not written_flag or read_flag: + # this block is either + # (1) not written + # (2) already read by this reader + # try to read the next block + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks + if self.current_idx == start_index: + # no block found + if time.time( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. ", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + # wait for a while (0.1 us) + time.sleep(1e-7) + continue + # found a block that is not read by this reader + # let caller read from the buffer + with self.buffer.get_data(self.current_idx) as buf: + yield buf + + # caller has read from the buffer + # set the read flag + metadata_buffer[self.reader_rank + 1] = 1 + break + + def enqueue(self, obj): + assert self._is_writer, "Only writers can enqueue" + serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL) + if len(serialized_obj) > self.buffer.max_chunk_bytes: + raise RuntimeError( + f"{len(serialized_obj)=} larger than the allowed value " + f"{self.buffer.max_chunk_bytes}," + "Please increase the max_chunk_bytes parameter.") + with self.acquire_write() as buf: + buf[:len(serialized_obj)] = serialized_obj + + def dequeue(self): + assert self._is_reader, "Only readers can dequeue" + with self.acquire_read() as buf: + # no need to know the size of serialized object + # pickle format itself contains the size information internally + # see https://docs.python.org/3/library/pickle.html + obj = pickle.loads(buf) + return obj + + def broadcast_object(self, obj=None): + if self._is_writer: + self.enqueue(obj) + return obj + else: + return self.dequeue() + + def create_from_process_group(pg: ProcessGroup, + max_chunk_bytes, + max_chunks, + writer_rank=0) -> "ShmRingBufferIO": + group_rank = dist.get_rank(pg) + group_world_size = dist.get_world_size(pg) + ranks_inside_group = list(range(group_world_size)) + global_ranks = dist.get_process_group_ranks(pg) + n_reader = group_world_size - 1 + buffer: ShmRingBuffer + if group_rank == writer_rank: + buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks) + dist.broadcast_object_list([buffer], src=global_ranks[writer_rank]) + dist.barrier(pg) + return ShmRingBufferIO(buffer, -1) + else: + recv = [None] + dist.broadcast_object_list(recv, src=global_ranks[writer_rank]) + dist.barrier(pg) + buffer = recv[0] # type: ignore + rest_ranks = [r for r in ranks_inside_group if r != writer_rank] + return ShmRingBufferIO(buffer, rest_ranks.index(group_rank)) diff --git a/vllm/distributed/parallel_state.py 
b/vllm/distributed/parallel_state.py index 02b0dcbcb6b24..5188fadbb92a5 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -98,6 +98,7 @@ class GroupCoordinator: # communicators are only created for world size > 1 pynccl_comm: Optional[Any] # PyNccl communicator ca_comm: Optional[Any] # Custom allreduce communicator + shm_broadcaster: Optional[Any] # shared memory broadcaster def __init__( self, @@ -162,6 +163,13 @@ def __init__( else: self.ca_comm = None + from vllm.distributed.device_communicators.shm_broadcast import ( + ShmRingBufferIO) + self.shm_broadcaster: Optional[ShmRingBufferIO] = None + if self.world_size > 1 and is_in_the_same_node(self.cpu_group): + self.shm_broadcaster = ShmRingBufferIO.create_from_process_group( + self.cpu_group, 1 << 20, 6) + @property def first_rank(self): """Return the global rank of the first process in the group""" @@ -324,6 +332,30 @@ def broadcast(self, input_: torch.Tensor, src: int = 0): group=self.device_group) return input_ + def broadcast_object(self, obj: Optional[Any] = None, src: int = 0): + """Broadcast the input object. + NOTE: `src` is the local rank of the source rank. + """ + assert src < self.world_size, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return obj + if self.shm_broadcaster is not None: + assert src == 0, "Shared memory broadcaster only supports src=0" + return self.shm_broadcaster.broadcast_object(obj) + if self.rank_in_group == src: + torch.distributed.broadcast_object_list([obj], + src=self.ranks[src], + group=self.cpu_group) + return obj + else: + recv = [None] + torch.distributed.broadcast_object_list(recv, + src=self.ranks[src], + group=self.cpu_group) + return recv[0] + def broadcast_object_list(self, obj_list: List[Any], src: int = 0, @@ -371,9 +403,7 @@ def broadcast_tensor_dict( # `metadata_list` lives in CPU memory. # `broadcast_object_list` has serialization & deserialization, # all happening on CPU. Therefore, we can use the CPU group. - torch.distributed.broadcast_object_list([metadata_list], - src=src, - group=metadata_group) + self.broadcast_object(metadata_list, src=src) async_handles = [] for tensor in tensor_list: if tensor.numel() == 0: @@ -396,14 +426,10 @@ def broadcast_tensor_dict( async_handle.wait() else: - recv_metadata_list = [None] - torch.distributed.broadcast_object_list(recv_metadata_list, - src=src, - group=metadata_group) - assert recv_metadata_list[0] is not None + metadata_list = self.broadcast_object(None, src=src) tensor_dict = {} async_handles = [] - for key, value in recv_metadata_list[0]: + for key, value in metadata_list: if isinstance(value, TensorMetadata): tensor = torch.empty(value.size, dtype=value.dtype, diff --git a/vllm/envs.py b/vllm/envs.py index ae2fcd0826fb1..49277e2d3519f 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -5,6 +5,7 @@ VLLM_HOST_IP: str = "" VLLM_PORT: Optional[int] = None VLLM_USE_MODELSCOPE: bool = False + VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 VLLM_INSTANCE_ID: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None @@ -114,6 +115,10 @@ "VLLM_INSTANCE_ID": lambda: os.environ.get("VLLM_INSTANCE_ID", None), + # Interval in seconds to log a warning message when the ring buffer is full + "VLLM_RINGBUFFER_WARNING_INTERVAL": + lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")), + # path to cudatoolkit home directory, under which should be bin, include, # and lib directories. 
"CUDA_HOME": From bd620b01fb74d5269ca6fc0fd32f66bfb205a358 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 20 Jun 2024 23:39:40 -0700 Subject: [PATCH 73/98] [Kernel][CPU] Add Quick `gelu` to CPU (#5717) --- csrc/cpu/activation.cpp | 19 +++++++++++++++++++ csrc/cpu/torch_bindings.cpp | 4 ++++ vllm/_ipex_ops.py | 3 +++ vllm/model_executor/layers/activation.py | 3 +++ 4 files changed, 29 insertions(+) diff --git a/csrc/cpu/activation.cpp b/csrc/cpu/activation.cpp index becd2ac42f17a..039b8d5c30d46 100644 --- a/csrc/cpu/activation.cpp +++ b/csrc/cpu/activation.cpp @@ -59,6 +59,13 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) { return w3 * x * (ones + t); } +FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 zeros(0.0); + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(1.702f); + return x / (ones + (zeros - w1 * x).exp()); +} + FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(M_SQRT1_2); @@ -142,3 +149,15 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) { CPU_KERNEL_GUARD_OUT(gelu_fast_impl) }); } + +void gelu_quick(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_quick_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_quick_impl) + }); +} diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index a2bf0d49adba5..39e8cf3ed3c10 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -58,6 +58,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_fast", torch::kCPU, &gelu_fast); + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCPU, &gelu_quick); + // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 1e60e0848673b..99a875c9b3fb7 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -43,6 +43,9 @@ def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: out.copy_(torch.nn.functional.gelu(x)) + # TODO add implementation of gelu_quick here + # def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + def paged_attention_v1( out: torch.Tensor, query: torch.Tensor, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 80cad15b43426..5bfdba67b443d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -155,6 +155,9 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_quick(out, x) return out + # TODO implement forward_xpu for QuickGELU + # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
From 5b15bde5399cbcb1052bfb49584f81ed300cd4ac Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 21 Jun 2024 12:44:29 -0400 Subject: [PATCH 74/98] [Doc] Documentation on supported hardware for quantization methods (#5745) --- docs/source/index.rst | 1 + docs/source/quantization/fp8.rst | 4 ++- .../quantization/supported_hardware.rst | 30 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 docs/source/quantization/supported_hardware.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 8795a865c3db6..05133eb6d867a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -100,6 +100,7 @@ Documentation :maxdepth: 1 :caption: Quantization + quantization/supported_hardware quantization/auto_awq quantization/fp8 quantization/fp8_e5m2_kvcache diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index 312a564595cc8..09f3136644c37 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -3,7 +3,9 @@ FP8 ================== -vLLM supports FP8 (8-bit floating point) computation using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are supported. Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. +vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. +Currently, only Hopper and Ada Lovelace GPUs are supported. +Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM `_. diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst new file mode 100644 index 0000000000000..df445e00a3958 --- /dev/null +++ b/docs/source/quantization/supported_hardware.rst @@ -0,0 +1,30 @@ +.. _supported_hardware_for_quantization: + +Supported Hardware for Quantization Kernels +=========================================== + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +FP8 ❌ ❌ ❌ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +Marlin ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== + +Notes: +^^^^^^ + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅" indicates that the quantization method is supported on the specified hardware. +- "❌" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. 
+ +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file From f1e72cc19a21928400b63743d5fe164ec8ed30e8 Mon Sep 17 00:00:00 2001 From: zhyncs Date: Sat, 22 Jun 2024 03:15:48 +0800 Subject: [PATCH 75/98] [BugFix] exclude version 1.15.0 for modelscope (#5668) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5b3e682a80169..d031d98c5b7e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -172,7 +172,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer modelscope + pip install accelerate hf_transfer 'modelscope!=1.15.0' ENV VLLM_USAGE_SOURCE production-docker-image From 7187507301aa8361407e04be42d0d50680891493 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 21 Jun 2024 14:04:26 -0700 Subject: [PATCH 76/98] [ci][test] fix ca test in main (#5746) --- .buildkite/test-pipeline.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c337a81d4a0d2..0b87e6280f0bb 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -197,6 +197,9 @@ steps: gpu: a100 num_gpus: 4 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py From f5dda63eb5fcb5624b93fa5f09da01d5372bbce4 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Fri, 21 Jun 2024 15:42:46 -0700 Subject: [PATCH 77/98] [LoRA] Add support for pinning lora adapters in the LRU cache (#5603) --- tests/lora/test_lora_manager.py | 64 +++++++++++++++++++++++ vllm/engine/llm_engine.py | 3 ++ vllm/executor/cpu_executor.py | 3 ++ vllm/executor/distributed_gpu_executor.py | 7 +++ vllm/executor/executor_base.py | 4 ++ vllm/executor/gpu_executor.py | 4 ++ vllm/executor/neuron_executor.py | 3 ++ vllm/lora/models.py | 26 +++++++++ vllm/lora/worker_manager.py | 3 ++ vllm/utils.py | 43 +++++++++++++-- vllm/worker/model_runner.py | 5 ++ vllm/worker/worker.py | 3 ++ vllm/worker/worker_base.py | 8 +++ 13 files changed, 171 insertions(+), 5 deletions(-) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 51a56b121ae2c..2133bce14957b 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -209,6 +209,34 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): assert manager.activate_lora(3) assert manager.lora_index_to_id[0] == 2 assert manager.lora_index_to_id[1] == 3 + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.activate_lora(1) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.deactivate_lora(2) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_lora(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.pin_lora(3) + assert manager.pin_lora(1) + with pytest.raises(RuntimeError): + assert manager.pin_lora(2) + assert 
manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + with pytest.raises(RuntimeError): + assert manager.activate_lora(2) + + assert manager.deactivate_lora(3) + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.remove_lora(3) + with pytest.raises(ValueError): + assert manager.pin_lora(3) def test_lru_lora_model_manager(dist_init, dummy_model): @@ -288,6 +316,42 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert set(manager.list_loras()) == set() assert all(x is None for x in manager.lora_index_to_id) + # pinning + assert manager.add_lora(model_lora3) + assert manager.activate_lora(3) + assert manager.add_lora(model_lora4) + assert manager.activate_lora(4) + assert set(manager.list_loras()) == {3, 4} + with pytest.raises(ValueError): + assert manager.pin_lora(1) + assert manager.pin_lora(3) + # Remove manually + assert manager.remove_lora(3) + assert not manager.remove_lora(3) + + assert set(manager.list_loras()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.add_lora(model_lora1) + assert manager.pin_lora(1) + assert manager.add_lora(model_lora2) + assert manager.activate_lora(2) + + assert set(manager.list_loras()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + assert manager.remove_oldest_lora() + assert set(manager.list_loras()) == {1} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] is None + + with pytest.raises(RuntimeError): + assert manager.remove_oldest_lora() + + assert set(manager.list_loras()) == {1} + def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 75d417f525e3a..f7eae257fdd16 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1009,6 +1009,9 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_executor.list_loras() + def pin_lora(self, lora_id: int) -> bool: + return self.model_executor.pin_lora(lora_id) + def check_health(self) -> None: self.model_executor.check_health() diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index a2212459f034e..6137cecd881d0 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -84,6 +84,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py index f7c608af1ad39..235b5bc47021d 100644 --- a/vllm/executor/distributed_gpu_executor.py +++ b/vllm/executor/distributed_gpu_executor.py @@ -100,6 +100,13 @@ def remove_lora(self, lora_id: int) -> bool: lora_id=lora_id, ) + def pin_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
+ return self._run_workers( + "pin_lora", + lora_id=lora_id, + ) + def list_loras(self) -> Set[int]: return self._run_workers("list_loras") diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 4d01939c2e38b..7c2520b5a64f5 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -86,6 +86,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError + @abstractmethod + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError # type: ignore + @abstractmethod def list_loras(self) -> Set[int]: raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 3ad201f4757ec..0a654200ed796 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -99,6 +99,10 @@ def remove_lora(self, lora_id: int) -> bool: assert lora_id > 0, "lora_id must be greater than 0." return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index e7f0e887921b7..c5e2fb0f67736 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -65,6 +65,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 3e82856866d85..afb9ba4550671 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -525,6 +525,12 @@ def remove_lora(self, lora_id: int) -> bool: self.long_lora_context.offsets_by_lora_id.pop(lora_id, None) return bool(self._registered_loras.pop(lora_id, None)) + def pin_lora(self, lora_id: int) -> bool: + """Pin a LoRAModel in the manager cache.""" + raise NotImplementedError( + "Pinning is not supported in LoRAModelManager." + "Use LRUCacheLoRAModelManager for pinning") # type: ignore + # TODO see if this can be vectorized def _set_lora_mapping(self, mapping: LoRAMapping) -> None: (base_indices, sampler_indices, sampler_indices_padded, @@ -777,6 +783,26 @@ def remove_oldest_lora(self) -> bool: return True return False + def pin_lora(self, lora_id: int) -> bool: + """Pin a LoRAModel in the manager cache.""" + self._pin_lora_in_cpu_cache(lora_id) + self._pin_lora_in_gpu_cache(lora_id) + return True + + def _pin_lora_in_cpu_cache(self, lora_id: int): + try: + self._registered_loras.pin(lora_id) + except ValueError as err: + raise ValueError("Pinning failed. 
" + f"LoRA {lora_id} is not registered.") from err + + def _pin_lora_in_gpu_cache(self, lora_id: int): + if lora_id not in self._active_loras: + # move lora to gpu if not already active + self.activate_lora(lora_id) + + self._active_loras.pin(lora_id) + def create_lora_manager( model: nn.Module, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 498b2b9ddb18a..ca4903c23bcaa 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -221,6 +221,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self._lora_manager.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self._lora_manager.pin_lora(lora_id) + def remove_all_loras(self): self._lora_manager.remove_all_loras() diff --git a/vllm/utils.py b/vllm/utils.py index 27a7b1042d88f..ce5c377eff2d4 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -15,7 +15,7 @@ from functools import lru_cache, partial, wraps from platform import uname from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic, - Hashable, List, Optional, OrderedDict, Tuple, TypeVar, + Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar, Union) import numpy as np @@ -44,6 +44,13 @@ T = TypeVar("T") +class _Sentinel: + ... + + +ALL_PINNED_SENTINEL = _Sentinel() + + class Device(enum.Enum): GPU = enum.auto() CPU = enum.auto() @@ -67,6 +74,7 @@ class LRUCache(Generic[T]): def __init__(self, capacity: int): self.cache: OrderedDict[Hashable, T] = OrderedDict() + self.pinned_items: Set[Hashable] = set() self.capacity = capacity def __contains__(self, key: Hashable) -> bool: @@ -102,14 +110,36 @@ def put(self, key: Hashable, value: T) -> None: self.cache.move_to_end(key) self._remove_old_if_needed() + def pin(self, key: Hashable) -> None: + """ + Pins a key in the cache preventing it from being + evicted in the LRU order. 
+ """ + if key not in self.cache: + raise ValueError(f"Cannot pin key: {key} not in cache.") + self.pinned_items.add(key) + + def _unpin(self, key: Hashable) -> None: + self.pinned_items.remove(key) + def _on_remove(self, key: Hashable, value: Optional[T]): pass - def remove_oldest(self): + def remove_oldest(self, remove_pinned=False): if not self.cache: return - key, value = self.cache.popitem(last=False) - self._on_remove(key, value) + + if not remove_pinned: + # pop the oldest item in the cache that is not pinned + lru_key = next( + (key for key in self.cache if key not in self.pinned_items), + ALL_PINNED_SENTINEL) + if lru_key is ALL_PINNED_SENTINEL: + raise RuntimeError("All items are pinned, " + "cannot remove oldest from the cache.") + else: + lru_key = next(iter(self.cache)) + self.pop(lru_key) def _remove_old_if_needed(self) -> None: while len(self.cache) > self.capacity: @@ -120,13 +150,16 @@ def pop(self, default_value: Optional[T] = None) -> Optional[T]: run_on_remove = key in self.cache value: Optional[T] = self.cache.pop(key, default_value) + # remove from pinned items + if key in self.pinned_items: + self._unpin(key) if run_on_remove: self._on_remove(key, value) return value def clear(self): while len(self.cache) > 0: - self.remove_oldest() + self.remove_oldest(remove_pinned=True) self.cache.clear() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e24835a1ea7fb..a321eafce1a2f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -878,6 +878,11 @@ def remove_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_lora(lora_id) + def list_loras(self) -> Set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index e334ffbb755bb..c60764ef1bed8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -333,6 +333,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_runner.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.model_runner.list_loras() diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 3d52fd71ec4b8..dc09718de4a32 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -70,6 +70,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError + @abstractmethod + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError + @abstractmethod def list_loras(self) -> Set[int]: raise NotImplementedError @@ -86,6 +90,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise ValueError(f"{type(self)} does not support LoRA") + def pin_lora(self, lora_id: int) -> bool: + return ValueError( + f"{type(self)} does not support LoRA") # type: ignore + def list_loras(self) -> Set[int]: raise ValueError(f"{type(self)} does not support LoRA") From cf90ae01237018f70573f69c599d26648ff7740b Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 22 Jun 2024 08:09:34 +0800 Subject: [PATCH 78/98] [CI][Hardware][Intel GPU] add Intel GPU(XPU) ci pipeline (#5616) --- .buildkite/test-template-aws.j2 
| 10 ++++++++-- README.md | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index fb34b787e0cbd..1a7fb44c2ecc5 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -42,12 +42,18 @@ steps: command: bash .buildkite/run-neuron-test.sh soft_fail: false - - label: "Intel Test" + - label: "Intel CPU Test" depends_on: ~ agents: - queue: intel + queue: intel-cpu command: bash .buildkite/run-cpu-test.sh + - label: "Intel GPU Test" + depends_on: ~ + agents: + queue: intel-gpu + command: bash .buildkite/run-xpu-test.sh + {% for step in steps %} {% if step.gpu == "a100" %} - label: "{{ step.label }}" diff --git a/README.md b/README.md index c24768bf78173..3e0da945d9be8 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs +- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs - (Experimental) Prefix caching support - (Experimental) Multi-lora support From 9c62db07ed8ee28d9f1a0e6ac215446d49532008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Sat, 22 Jun 2024 10:07:08 +0800 Subject: [PATCH 79/98] [Model] Support Qwen-VL and Qwen-VL-Chat models with text-only inputs (#5710) Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d22ea6b79de0f..b6ea6ab396642 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -28,6 +28,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput +from vllm.utils import print_warning_once class QWenMLP(nn.Module): @@ -288,6 +289,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip loading visual weights to support Qwen-VL models + # in cases with text-only inputs + # TODO: add support for Qwen-VL + if (name not in params_dict + and name.startswith("transformer.visual.")): + print_warning_once( + "Only text inputs are allowed. 
Images won't be handled " + "until Qwen-VL models are fully supported.") + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From ff9ddbceee63efba6ba1f8d4dc66a92f1191da04 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Fri, 21 Jun 2024 20:33:12 -0700 Subject: [PATCH 80/98] [Misc] Remove #4789 workaround left in vllm/entrypoints/openai/run_batch.py (#5756) --- vllm/entrypoints/openai/run_batch.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 488ac89710b6d..dac6c2b4cd48f 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,5 +1,4 @@ import asyncio -import sys from io import StringIO from typing import Awaitable, List @@ -137,9 +136,6 @@ async def main(args): output_buffer.seek(0) await write_file(args.output_file, output_buffer.read().strip()) - # Temporary workaround for https://github.com/vllm-project/vllm/issues/4789 - sys.exit(0) - if __name__ == "__main__": args = parse_args() From 0cbc1d2b4ff9e3afa32ffd2d5d308c136c2d15e3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Jun 2024 22:25:14 -0700 Subject: [PATCH 81/98] [Bugfix] Fix pin_lora error in TPU executor (#5760) --- vllm/executor/tpu_executor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 7061ad85f88c0..5ed00e1374100 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -82,6 +82,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for TPU backend.") + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for TPU backend.") + def list_loras(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for TPU backend.") From 8c00f9c15d13aed34b129b31c32a227be230e218 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Jun 2024 23:09:40 -0700 Subject: [PATCH 82/98] [Docs][TPU] Add installation tip for TPU (#5761) --- .../getting_started/tpu-installation.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 3627600e1f23a..e96aabbb63279 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -73,3 +73,21 @@ Next, build vLLM from source. This will only take a few seconds: .. code-block:: console $ VLLM_TARGET_DEVICE="tpu" python setup.py develop + + +.. tip:: + + If you encounter the following error: + + .. code-block:: console + + from torch._C import * # noqa: F403 + ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + + + You can install OpenBLAS with the following command: + + .. 
code-block:: console + + $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + From 832ea88fcb4819037b685fb47b3a0de37f2804d3 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 22 Jun 2024 10:00:43 -0700 Subject: [PATCH 83/98] [core][distributed] improve shared memory broadcast (#5754) --- .../device_communicators/shm_broadcast.py | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 119befcf64052..c44bd2f11ee8b 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -48,6 +48,26 @@ def __init__(self, | written_flag | reader0_flag | reader1_flag | ... | readerN_flag | +--------------+--------------+--------------+-----+--------------+ + The state of metadata is as follows: + + (case 1) 0???...???: the block is not written yet, cannot read, can write + (case 2) 1000...000: the block is just written, can read, cannot write + (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write + (case 4) 1111...111: the block is written and read by all readers, cannot read, can write + + State transition for readers: + + When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read. + Only after the caller finishes reading the block, the reader can mark the block as read. + Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0). + + State transition for writer: + + When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case + to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer + can reset the reader flags to 0, and mark the block as written (from 0 to 1). + NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct. + During creation, `name` is None and the buffer is created. We can pass the created object to other processes by pickling it. 
The other processes will get the name of the shared memory and open it, so that they can access the @@ -81,10 +101,6 @@ def __init__(self, lambda *args, **kwargs: None): self.shared_memory = shared_memory.SharedMemory(name=name) assert self.shared_memory.size == self.total_bytes_of_buffer - with memoryview(self.shared_memory.buf[self.metadata_offset:] - ) as metadata_buffer: - tensor = torch.frombuffer(metadata_buffer, dtype=torch.uint8) - assert torch.all(tensor == 0) def __reduce__(self): return ( @@ -163,11 +179,15 @@ def acquire_write(self): yield buf # caller has written to the buffer - # mark the block as written - metadata_buffer[0] = 1 + # NOTE: order is important here + # first set the read flags to 0 + # then set the written flag to 1 + # otherwise, the readers may think they already read the block for i in range(1, self.buffer.n_reader + 1): # set read flag to 0, meaning it is not read yet metadata_buffer[i] = 0 + # mark the block as written + metadata_buffer[0] = 1 break @contextmanager @@ -247,13 +267,15 @@ def create_from_process_group(pg: ProcessGroup, buffer: ShmRingBuffer if group_rank == writer_rank: buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks) - dist.broadcast_object_list([buffer], src=global_ranks[writer_rank]) - dist.barrier(pg) + dist.broadcast_object_list([buffer], + src=global_ranks[writer_rank], + group=pg) return ShmRingBufferIO(buffer, -1) else: recv = [None] - dist.broadcast_object_list(recv, src=global_ranks[writer_rank]) - dist.barrier(pg) + dist.broadcast_object_list(recv, + src=global_ranks[writer_rank], + group=pg) buffer = recv[0] # type: ignore rest_ranks = [r for r in ranks_inside_group if r != writer_rank] return ShmRingBufferIO(buffer, rest_ranks.index(group_rank)) From 6c916ac8a80d1b2f4e0d0113a67767dc254a3598 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 24 Jun 2024 02:37:11 +0530 Subject: [PATCH 84/98] [BugFix] [Kernel] Add Cutlass2x fallback kernels (#5744) Co-authored-by: Varun Sundar Rabindranath --- csrc/quantization/cutlass_w8a8/common.hpp | 8 +++ .../cutlass_w8a8/scaled_mm_c2x.cu | 52 ++++++++++++++++--- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp index 23d0587bbdc5d..bf04bb400790f 100644 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ b/csrc/quantization/cutlass_w8a8/common.hpp @@ -17,3 +17,11 @@ inline uint32_t next_pow_2(uint32_t const num) { return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); } +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 740b9fb64a754..38a20a1727d18 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -250,12 +250,39 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, CUTLASS_CHECK(status); } +template +void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + // In some cases, the GPU isn't able to accommodate the + // shared memory requirements of the Gemm. In such cases, use + // the FallbackGemm instead. 
+ static const int max_shared_mem_per_block_opt_in = + get_cuda_max_shared_memory_per_block_opt_in(0); + + size_t const gemm_shared_mem_size = + sizeof(typename Gemm::KernelType::SharedStorage); + size_t const fallback_gemm_shared_mem_size = + sizeof(typename FallbackGemm::KernelType::SharedStorage); + + if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { + return cutlass_gemm_caller(out, a, b, + std::forward(args)...); + } else { + TORCH_CHECK(fallback_gemm_shared_mem_size <= + max_shared_mem_per_block_opt_in); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + template typename Epilogue> struct sm80_config_default { // This config is used in 2 cases, // - M in (128, inf) // - M in (64, 128] and N >= 8192 + // Shared Memory required by this Gemm - 81920 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -271,6 +298,7 @@ struct sm80_config_M64 { // This config is used in 2 cases, // - M in (32, 64] // - M in (64, 128] and N < 8192 + // Shared Memory required by this Gemm - 122880 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -284,6 +312,7 @@ template typename Epilogue> struct sm80_config_M32 { // M in (16, 32] + // Shared Memory required by this Gemm - 61440 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; @@ -297,6 +326,7 @@ template typename Epilogue> struct sm80_config_M16 { // M in [1, 16] + // Shared Memory required by this Gemm - 51200 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; @@ -331,35 +361,45 @@ void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, using Cutlass2xGemmM16 = typename sm80_config_M16::Cutlass2xGemm; + // Due to shared memory requirements, some Gemms may fail to run on some + // GPUs. As the name indicates, the Fallback Gemm is used as an alternative + // in such cases. + // sm80_config_M16 has the least shared-memory requirement. However, + // based on some profiling, we select sm80_config_M32 as a better alternative + // performance wise. 
+ using FallbackGemm = + typename sm80_config_M32::Cutlass2xGemm; + uint32_t const m = a.size(0); uint32_t const mp2 = std::max(static_cast(16), next_pow_2(m)); // next power of 2 if (mp2 <= 16) { // M in [1, 16] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 32) { // M in (16, 32] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 64) { // M in (32, 64] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 128) { // M in (64, 128] uint32_t const n = out.size(1); bool const small_n = n < 8192; if (small_n) { - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else { - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } } else { // M in (128, inf) - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } } From 5d4d90536fa24c032bb91ae629b7b4958e045b03 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Sun, 23 Jun 2024 17:42:28 -0400 Subject: [PATCH 85/98] [Distributed] Add send and recv helpers (#5719) --- tests/distributed/test_comm_ops.py | 78 +++++++- tests/distributed/test_custom_all_reduce.py | 5 +- tests/distributed/test_pynccl.py | 16 +- tests/utils.py | 2 +- .../device_communicators/pynccl.py | 14 +- vllm/distributed/parallel_state.py | 187 ++++++++++++++++++ 6 files changed, 278 insertions(+), 24 deletions(-) diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 53654dc40d10d..bf0f31df02fa5 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -8,12 +8,11 @@ import ray import torch -from vllm.distributed import (broadcast_tensor_dict, +from vllm.distributed import (broadcast_tensor_dict, get_pp_group, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from ..utils import (init_test_distributed_environment, - multi_process_tensor_parallel) +from ..utils import init_test_distributed_environment, multi_process_parallel @ray.remote(num_gpus=1, max_calls=1) @@ -105,6 +104,68 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, assert torch.allclose(recv_dict["f"], test_dict["f"]) +@ray.remote(num_gpus=1, max_calls=1) +def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, + distributed_init_port: str): + del os.environ["CUDA_VISIBLE_DEVICES"] + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + + test_dict = { + # device tensor + "a": torch.arange(8, dtype=torch.float32, device="cuda"), + # CPU tensor + "b": torch.arange(16, dtype=torch.int8, device="cpu"), + "c": "test", + "d": [1, 2, 3], + "e": { + "a": 1, + "b": 2 + }, + # empty tensor + "f": torch.tensor([], dtype=torch.float32, device="cuda"), + } + + if not get_pp_group().is_first_rank: + recv_dict = get_pp_group().recv_tensor_dict() + + if not get_pp_group().is_last_rank: + get_pp_group().send_tensor_dict(test_dict) + + if not get_pp_group().is_first_rank: + assert len(recv_dict) == len(test_dict) + assert torch.allclose(recv_dict["a"], test_dict["a"]) + assert torch.allclose(recv_dict["b"], test_dict["b"]) + assert recv_dict["c"] == test_dict["c"] + assert recv_dict["d"] == 
test_dict["d"] + assert recv_dict["e"] == test_dict["e"] + assert torch.allclose(recv_dict["f"], test_dict["f"]) + + +@ray.remote(num_gpus=1, max_calls=1) +def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, + distributed_init_port: str): + del os.environ["CUDA_VISIBLE_DEVICES"] + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + + size = 64 + test_tensor = torch.arange(64, dtype=torch.float32, device="cuda") + + if not get_pp_group().is_first_rank: + recv_tensor = get_pp_group().recv(size, dtype=torch.float32) + + if not get_pp_group().is_last_rank: + get_pp_group().send(test_tensor) + + if not get_pp_group().is_first_rank: + assert torch.allclose(test_tensor, recv_tensor) + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize("tp_size", [2]) @@ -113,4 +174,13 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, broadcast_tensor_dict_test_worker ]) def test_multi_process_tensor_parallel(tp_size, test_target): - multi_process_tensor_parallel(tp_size, 1, test_target) + multi_process_parallel(tp_size, 1, test_target) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize("pp_size", [2]) +@pytest.mark.parametrize( + "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) +def test_multi_process_pipeline_parallel(pp_size, test_target): + multi_process_parallel(1, pp_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 9a39160b8a462..3c281a45fcaf1 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -12,8 +12,7 @@ get_tp_group, graph_capture) from ..utils import (ensure_model_parallel_initialized, - init_test_distributed_environment, - multi_process_tensor_parallel) + init_test_distributed_environment, multi_process_parallel) random.seed(42) test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] @@ -113,4 +112,4 @@ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") - multi_process_tensor_parallel(tp_size, pipeline_parallel_size, test_target) + multi_process_parallel(tp_size, pipeline_parallel_size, test_target) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 964dbc5423e75..e0e424439e3a5 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -168,9 +168,13 @@ def send_recv_worker_fn(): dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): if pynccl_comm.rank == 0: - pynccl_comm.send(tensor) + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % + pynccl_comm.world_size) else: - pynccl_comm.recv(tensor) + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % + pynccl_comm.world_size) result = tensor.mean().cpu().item() assert result == 1 @@ -203,9 +207,13 @@ def multiple_send_recv_worker_fn(): device=device) with pynccl_comm.change_state(enable=True): if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.send(tensor) + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % + pynccl_comm.world_size) else: - pynccl_comm.recv(tensor) + pynccl_comm.recv(tensor, + 
src=(pynccl_comm.rank - 1) % + pynccl_comm.world_size) result = tensor.mean().cpu().item() if torch.distributed.get_rank() in [0, 2]: assert result == 1 diff --git a/tests/utils.py b/tests/utils.py index bc30515c83100..174efca4af532 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -129,7 +129,7 @@ def init_test_distributed_environment( ensure_model_parallel_initialized(tp_size, pp_size) -def multi_process_tensor_parallel( +def multi_process_parallel( tp_size: int, pp_size: int, test_target, diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 83eec264b6f81..7319566545678 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -121,10 +121,7 @@ def all_reduce(self, ncclRedOpTypeEnum.from_torch(op), self.comm, cudaStream_t(stream.cuda_stream)) - def send(self, - tensor: torch.Tensor, - dst: Optional[int] = None, - stream=None): + def send(self, tensor: torch.Tensor, dst: int, stream=None): if self.disabled: return assert tensor.device == self.device, ( @@ -132,16 +129,11 @@ def send(self, f"but the input tensor is on {tensor.device}") if stream is None: stream = self.stream - if dst is None: - dst = (self.rank + 1) % self.world_size self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, self.comm, cudaStream_t(stream.cuda_stream)) - def recv(self, - tensor: torch.Tensor, - src: Optional[int] = None, - stream=None): + def recv(self, tensor: torch.Tensor, src: int, stream=None): if self.disabled: return assert tensor.device == self.device, ( @@ -149,8 +141,6 @@ def recv(self, f"but the input tensor is on {tensor.device}") if stream is None: stream = self.stream - if src is None: - src = (self.rank - 1) % self.world_size self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5188fadbb92a5..5f1decb376af5 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -20,6 +20,7 @@ steps. """ import contextlib +import pickle from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass @@ -28,6 +29,7 @@ from unittest.mock import patch import torch +import torch.distributed from torch.distributed import Backend, ProcessGroup import vllm.envs as envs @@ -180,6 +182,16 @@ def last_rank(self): """Return the global rank of the last process in the group""" return self.ranks[-1] + @property + def is_first_rank(self): + """Return whether the caller is the first process in the group""" + return self.rank == self.first_rank + + @property + def is_last_rank(self): + """Return whether the caller is the last process in the group""" + return self.rank == self.last_rank + @property def next_rank(self): """Return the global rank of the process that follows the caller""" @@ -374,6 +386,70 @@ def broadcast_object_list(self, group=self.device_group) return obj_list + def send_object(self, obj: Any, dst: int) -> None: + """Send the input object list to the destination rank.""" + """NOTE: `dst` is the local rank of the destination rank.""" + + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + assert dst != self.rank, ( + "Invalid destination rank. 
Destination rank is the same " + "as the current rank.") + + # Serialize object to tensor and get the size as well + object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8) + + size_tensor = torch.tensor([object_tensor.numel()], + dtype=torch.long, + device="cpu") + + # Send object size + + torch.distributed.send(size_tensor, + dst=self.ranks[dst], + group=self.cpu_group) + + # Send object + torch.distributed.send(object_tensor, + dst=self.ranks[dst], + group=self.cpu_group) + + return None + + def recv_object(self, src: int) -> Any: + """Receive the input object list from the source rank.""" + """NOTE: `src` is the local rank of the source rank.""" + + assert src < self.world_size, f"Invalid src rank ({src})" + + assert src != self.rank, ( + "Invalid source rank. Source rank is the same as the current rank." + ) + + size_tensor = torch.empty(1, dtype=torch.long, device="cpu") + + # Receive object size + rank_size = torch.distributed.recv(size_tensor, + src=src, + group=self.cpu_group) + + # Tensor to receive serialized objects into. + object_tensor = torch.empty( # type: ignore[call-overload] + size_tensor.item(), # type: ignore[arg-type] + dtype=torch.uint8, + device="cpu") + + rank_object = torch.distributed.recv(object_tensor, + src=src, + group=self.cpu_group) + + assert rank_object == rank_size, ( + "Received object sender rank does not match the size sender rank.") + + obj = pickle.loads(object_tensor.numpy().tobytes()) + + return obj + def broadcast_tensor_dict( self, tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, @@ -459,6 +535,88 @@ def broadcast_tensor_dict( async_handle.wait() return tensor_dict + def send_tensor_dict( + self, + tensor_dict: Dict[Any, Union[torch.Tensor, Any]], + dst: Optional[int] = None + ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]: + """Send the input tensor dictionary. + NOTE: `dst` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. + if not torch.distributed.is_initialized() or self.world_size == 1: + return tensor_dict + + group = self.device_group + metadata_group = self.cpu_group + + if dst is None: + dst = self.next_rank + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), f"Expecting a dictionary, got {type(tensor_dict)}" + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `send_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.send_object(metadata_list, dst=dst) + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip sending empty tensors. + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.send(tensor, dst=dst, group=metadata_group) + else: + # use group for GPU tensors + torch.distributed.send(tensor, dst=dst, group=group) + return None + + def recv_tensor_dict( + self, + src: Optional[int] = None + ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]: + """Recv the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. 
+ if not torch.distributed.is_initialized() or self.world_size == 1: + return None + + group = self.device_group + metadata_group = self.cpu_group + + if src is None: + src = self.prev_rank + assert src < self.world_size, f"Invalid src rank ({src})" + + recv_metadata_list = self.recv_object(src=src) + tensor_dict = {} + for key, value in recv_metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + tensor_dict[key] = tensor + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.recv(tensor, + src=src, + group=metadata_group) + else: + # use group for GPU tensors + torch.distributed.recv(tensor, src=src, group=group) + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + return tensor_dict + def barrier(self): """Barrier synchronization among the group. NOTE: don't use `device_group` here! `barrier` in NCCL is @@ -468,6 +626,35 @@ def barrier(self): """ torch.distributed.barrier(group=self.cpu_group) + def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: + """Sends a tensor to the destination rank in a non-blocking way""" + """NOTE: `dst` is the local rank of the destination rank.""" + if dst is None: + dst = self.next_rank + + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.send(tensor, dst) + else: + torch.distributed.send(tensor, self.ranks[dst], self.device_group) + + def recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None) -> torch.Tensor: + """Receives a tensor from the src rank.""" + """NOTE: `src` is the local rank of the destination rank.""" + if src is None: + src = self.prev_rank + + tensor = torch.empty(size, dtype=dtype, device=self.device) + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.recv(tensor, src) + else: + torch.distributed.recv(tensor, self.ranks[src], self.device_group) + return tensor + def destroy(self): if self.device_group is not None: torch.distributed.destroy_process_group(self.device_group) From edd5fe5fa29b8f9cc5fa37a30cc7211e0ff37067 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 24 Jun 2024 12:11:53 +0800 Subject: [PATCH 86/98] [Bugfix] Add phi3v resize for dynamic shape and fix torchvision requirement (#5772) --- requirements-cpu.txt | 1 + requirements-cuda.txt | 2 + requirements-test.txt | 1 - tests/models/test_phi3v.py | 4 ++ vllm/model_executor/models/phi3v.py | 69 +++++++++++++++++++++++++++-- 5 files changed, 72 insertions(+), 5 deletions(-) diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 8b7d86e686217..21acee91d7b57 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -3,4 +3,5 @@ # Dependencies for x86_64 CPUs torch == 2.3.1+cpu +torchvision == 0.18.1+cpu # required for the image processor of phi3v, this must be updated alongside torch triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. 
\ No newline at end of file diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 3536179835967..10596ed85d600 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -5,5 +5,7 @@ ray >= 2.9 nvidia-ml-py # for pynvml package torch == 2.3.0 +# These must be updated alongside torch +torchvision == 0.18.0 # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0 diff --git a/requirements-test.txt b/requirements-test.txt index fef0ede7be0ff..8b68e0e939669 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -14,7 +14,6 @@ peft requests ray sentence-transformers # required for embedding -torchvision # required for the image processor of phi3v # Benchmarking aiohttp diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 23454759827d5..a29d50df4c4e5 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -22,6 +22,7 @@ def iter_phi3v_configs(model_name: str): image_hw_to_feature_size = { (1008, 1344): 1921, + (2016, 2688): 1933, } for (h, w), f in image_hw_to_feature_size.items(): @@ -75,6 +76,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], # TODO: Add test for `tensor_parallel_size` [ref: PR #3883] # Since we use _attn_implementation="eager" for hf_runner, here is # numeric difference for longer context and test can't pass +@pytest.mark.xfail( + reason="Inconsistent image processor being used due to lack " + "of support for dynamic image token replacement") @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index fa20a7c5903d6..dac832a686c2c 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -13,14 +13,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
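# A rough worked example of the HD-transform sizing that this patch copies
# from the HuggingFace phi3v image processor (calc_hd_transform_size and
# calc_padded_size, added further down); the math below mirrors those helpers
# for an arbitrary 4:3 landscape input and the default hd_num=16:
import numpy as np

def demo_hd_transform_size(width=800, height=600, hd_num=16):
    ratio = width / height                   # 4:3 -> 1.333...
    scale = 1
    while scale * np.ceil(scale / ratio) <= hd_num:
        scale += 1
    scale -= 1                               # -> 4
    new_width = int(scale * 336)             # -> 1344
    new_height = int(new_width / ratio)      # -> 1008
    # 1008 is already a multiple of 336, so padding adds nothing here and any
    # 4:3 landscape image ends up at (1344, 1008), matching the
    # (h, w) = (1008, 1344) shape exercised by the tests above.
    return new_width, new_height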
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict +from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict +import numpy as np import torch import torch.nn as nn +from PIL import Image from transformers import CLIPVisionConfig, PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig +from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -32,9 +35,11 @@ from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import get_dummy_image_data +from vllm.multimodal.image import ImagePixelData, get_dummy_image_data from vllm.sequence import SamplerOutput +logger = init_logger(__name__) + _KEYS_TO_MODIFY_MAPPING = { "model.vision_embed_tokens": "vision_embed_tokens", } @@ -268,7 +273,63 @@ class Phi3VImagePixelInputs(TypedDict): """Shape: (batch_size, 2)""" -@MULTIMODAL_REGISTRY.register_image_pixel_input() +# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported +# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py +def calc_padded_size(width, height, padding_unit=336): + target_height = int(np.ceil(height / padding_unit) * padding_unit) + top_padding = int((target_height - height) / 2) + bottom_padding = target_height - height - top_padding + padded_width = width + padded_height = height + top_padding + bottom_padding + return padded_width, padded_height + + +# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py +def calc_hd_transform_size(width, height, hd_num=16): + transposed = False + if width < height: + width, height = height, width + transposed = True + + ratio = width / height + scale = 1 + while scale * np.ceil(scale / ratio) <= hd_num: + scale += 1 + scale -= 1 + + new_width = int(scale * 336) + new_height = int(new_width / ratio) + + padded_width, padded_height = calc_padded_size(new_width, new_height) + + if transposed: + padded_width, padded_height = padded_height, padded_width + + return padded_width, padded_height + + +def _image_processor( + data: ImagePixelData, + model_config: ModelConfig, + vlm_config: VisionLanguageConfig, +) -> Dict[str, torch.Tensor]: + image = data.image + + if isinstance(image, Image.Image): + # Temporary patch before dynamic number of image tokens is supported + _, _, h, w = vlm_config.image_input_shape + if (w, h) != calc_hd_transform_size(image.width, image.height): + logger.warning( + "Dynamic image shape is currently not supported. 
" + "Resizing input image to (%d, %d).", w, h) + + data.image = image.resize((w, h)) + + return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \ + ._default_input_processor(data, model_config, vlm_config) + + +@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor) @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) class Phi3VForCausalLM(VisionLanguageModelBase): From c2462129521a64b62ace77b28641d2e3bec5831c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 24 Jun 2024 00:37:42 -0700 Subject: [PATCH 87/98] [doc][faq] add warning to download models for every nodes (#5783) --- docs/source/serving/distributed_serving.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index b0c45dbf70268..2a7937a9189c1 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -35,4 +35,7 @@ To scale vLLM beyond a single machine, install and start a `Ray runtime -After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. \ No newline at end of file +After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. + +.. warning:: + Please make sure you downloaded the model to all the nodes, or the model is downloaded to some distributed file system that is accessible by all nodes. From e72dc6cb3507d914eec8dfd0d5c7b9478f6a8ccc Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 24 Jun 2024 13:26:17 -0400 Subject: [PATCH 88/98] [Doc] Add "Suggest edit" button to doc pages (#5789) --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index ca26dcec4bb5a..af1f22b23c126 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,6 +66,7 @@ 'path_to_docs': 'docs/source', 'repository_url': 'https://github.com/vllm-project/vllm', 'use_repository_button': True, + 'use_edit_page_button': True, } # Add any paths that contain custom static files (such as style sheets) here, From 1744cc99ba9bdefea8f3f798cf51ed650b81a98e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 24 Jun 2024 13:48:55 -0400 Subject: [PATCH 89/98] [Doc] Add Phi-3-medium to list of supported models (#5788) --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index f4673dc27092f..47737ae525209 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -129,7 +129,7 @@ Alongside each architecture, we include some popular models that use it. - ✅︎ * - :code:`Phi3ForCausalLM` - Phi-3 - - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc. + - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. 
- * - :code:`Phi3SmallForCausalLM` - Phi-3-Small From ba991d5c84adbc0685075af88333c688ddb06011 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Mon, 24 Jun 2024 16:01:19 -0700 Subject: [PATCH 90/98] [Bugfix] Fix FlexibleArgumentParser replaces _ with - for actual args (#5795) --- vllm/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index ce5c377eff2d4..f0c7df5cf8c22 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -822,7 +822,13 @@ def parse_args(self, args=None, namespace=None): processed_args = [] for arg in args: if arg.startswith('--'): - processed_args.append('--' + arg[len('--'):].replace('_', '-')) + if '=' in arg: + key, value = arg.split('=', 1) + key = '--' + key[len('--'):].replace('_', '-') + processed_args.append(f'{key}={value}') + else: + processed_args.append('--' + + arg[len('--'):].replace('_', '-')) else: processed_args.append(arg) From e9de9dd551ac595a9f3825fcd1507deceef4f332 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 24 Jun 2024 21:09:02 -0700 Subject: [PATCH 91/98] [ci] Remove aws template (#5757) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 7 +- .buildkite/test-template-aws.j2 | 145 -------------------------------- 2 files changed, 5 insertions(+), 147 deletions(-) delete mode 100644 .buildkite/test-template-aws.j2 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0b87e6280f0bb..19b1bce162880 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,7 +1,10 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template-aws.j2` to generate -# the final pipeline yaml file. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. + steps: - label: Regression Test diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 deleted file mode 100644 index 1a7fb44c2ecc5..0000000000000 --- a/.buildkite/test-template-aws.j2 +++ /dev/null @@ -1,145 +0,0 @@ -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." 
- - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - priority: 100 - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel CPU Test" - depends_on: ~ - agents: - queue: intel-cpu - command: bash .buildkite/run-cpu-test.sh - - - label: "Intel GPU Test" - depends_on: ~ - agents: - queue: intel-gpu - command: bash .buildkite/run-xpu-test.sh - - {% for step in steps %} - {% if step.gpu == "a100" %} - - label: "{{ step.label }}" - agents: - queue: a100-queue - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: {{ docker_image }} - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - resources: - limits: - nvidia.com/gpu: {{ step.num_gpus or 1 }} - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - {% else %} - - label: "{{ step.label }}" - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue - {% elif step.no_gpu %} - queue: cpu_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - docker#v5.2.0: - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_TOKEN - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - {% endif %} - {% endfor %} From f23871e9eead900d6146961ca894f5bc91f30f5e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 25 Jun 2024 16:25:03 +0800 Subject: [PATCH 92/98] [Doc] Add notice about breaking changes to VLMs (#5818) --- docs/source/models/vlm.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 
70ac82e2005b9..de55a1a099192 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -5,6 +5,9 @@ Using VLMs vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM. +.. important:: + We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation. + Engine Arguments ---------------- @@ -39,6 +42,10 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` image_feature_size=576, ) +.. important:: + We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration. + + To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. @@ -63,6 +70,9 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS A code example can be found in `examples/llava_example.py `_. +.. important:: + We will remove the need to format image tokens in a future release. Afterwards, the input text will follow the same format as that for the original HuggingFace model. + Online OpenAI Vision API Compatible Inference ---------------------------------------------- @@ -89,6 +99,9 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with --image-feature-size 576 \ --chat-template template_llava.jinja +.. important:: + We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration. + To consume the server, you can use the OpenAI client like in the example below: .. code-block:: python From 2ce5d6688bae64e467640b05e73af2888e93afcf Mon Sep 17 00:00:00 2001 From: Woo-Yeon Lee Date: Tue, 25 Jun 2024 18:56:06 +0900 Subject: [PATCH 93/98] [Speculative Decoding] Support draft model on different tensor-parallel size than target model (#5414) --- .buildkite/test-pipeline.yaml | 3 +- benchmarks/benchmark_latency.py | 6 + .../e2e/test_integration_dist_tp2.py | 111 +++++++++++++ ...n_dist.py => test_integration_dist_tp4.py} | 41 +++-- vllm/config.py | 24 ++- vllm/distributed/parallel_state.py | 76 ++++++--- vllm/engine/arg_utils.py | 10 ++ vllm/spec_decode/multi_step_worker.py | 11 +- vllm/spec_decode/proposer_worker_base.py | 4 +- .../spec_decode/smaller_tp_proposer_worker.py | 149 ++++++++++++++++++ vllm/spec_decode/spec_decode_worker.py | 12 +- 11 files changed, 388 insertions(+), 59 deletions(-) create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py rename tests/spec_decode/e2e/{test_integration_dist.py => test_integration_dist_tp4.py} (62%) create mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 19b1bce162880..10cfe35d85be4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -54,7 +54,7 @@ steps: - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py + - pytest -v -s 
spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py @@ -71,6 +71,7 @@ steps: # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - label: Engine Test mirror_hardwares: [amd] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index a4cf0632b7790..f3d00e456f159 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -25,6 +25,8 @@ def main(args: argparse.Namespace): model=args.model, speculative_model=args.speculative_model, num_speculative_tokens=args.num_speculative_tokens, + speculative_draft_tensor_parallel_size=\ + args.speculative_draft_tensor_parallel_size, tokenizer=args.tokenizer, quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size, @@ -127,6 +129,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=None) parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--quantization', '-q', diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py new file mode 100644 index 0000000000000..5534b80c0aaa0 --- /dev/null +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -0,0 +1,111 @@ +"""Tests which cover integration of the speculative decoding framework with +tensor parallelism. +""" + +import pytest +import torch + +from vllm.utils import is_hip + +from .conftest import run_greedy_equality_correctness_test + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. 
+ 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality when tensor parallelism is used. + """ + if is_hip(): + pytest.skip("hip is not well-supported yet") + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "speculative_draft_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py similarity index 62% rename from tests/spec_decode/e2e/test_integration_dist.py rename to tests/spec_decode/e2e/test_integration_dist_tp4.py index d444ef24cbfda..56cb0147d9e4f 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -5,16 +5,16 @@ import pytest import torch -from vllm.utils import is_hip - from .conftest import run_greedy_equality_correctness_test -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") @pytest.mark.parametrize( "common_llm_kwargs", [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. "model": "JackFram/llama-68m", # Skip cuda graph recording for fast test. @@ -22,7 +22,7 @@ # Required for spec decode. "use_v2_block_manager": True, - "tensor_parallel_size": 2, + "tensor_parallel_size": 4, # Use AsyncLLM engine, so that the engine runs in its own process. # Otherwise, since vLLM does not follow true SPMD, the test runner @@ -31,35 +31,30 @@ # second run of the test to fail with internal NCCL error. 
"use_async": True, }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ { "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - }, - { - "speculative_model": "[ngram]", "num_speculative_tokens": 5, - "ngram_prompt_lookup_max": 3, }, ]) -@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize( - "output_len", + "test_llm_kwargs", [ - # Use smaller output len for fast test. - 32, + #TODO(wooyeon): add spec_draft_dp=2 case + { + "speculative_draft_tensor_parallel_size": 1, + }, ]) +@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) -def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, - batch_size: int, output_len: int): - """Verify greedy equality when tensor parallelism is used. +def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. """ - if is_hip(): - pytest.skip("hip is not well-supported yet") run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, - max_output_len=output_len, + max_output_len=32, force_output_len=True) diff --git a/vllm/config.py b/vllm/config.py index 8d004902fe4ff..0217a2b569928 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -797,6 +797,7 @@ def maybe_create_spec_config( target_parallel_config: ParallelConfig, target_dtype: str, speculative_model: Optional[str], + speculative_draft_tensor_parallel_size: Optional[int], num_speculative_tokens: Optional[int], speculative_max_model_len: Optional[int], enable_chunked_prefill: bool, @@ -819,6 +820,8 @@ def maybe_create_spec_config( target_dtype (str): The data type used for the target model. speculative_model (Optional[str]): The name of the speculative model, if provided. + speculative_draft_tensor_parallel_size (Optional[int]): The degree + of the tensor parallelism for the draft model. num_speculative_tokens (Optional[int]): The number of speculative tokens, if provided. Will default to the number in the draft model config if present, otherwise is required. @@ -939,7 +942,8 @@ def maybe_create_spec_config( draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( - target_parallel_config)) + target_parallel_config, + speculative_draft_tensor_parallel_size)) if num_speculative_tokens is None: raise ValueError( @@ -993,16 +997,26 @@ def _maybe_override_draft_max_model_len( @staticmethod def create_draft_parallel_config( - target_parallel_config: ParallelConfig) -> ParallelConfig: + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int] + ) -> ParallelConfig: """Create a parallel config for use by the draft worker. - This is mostly a copy of the target parallel config. In the future the - draft worker can have a different parallel strategy, e.g. TP=1. + This is mostly a copy of the target parallel config, except the tp_size. 
""" + if speculative_draft_tensor_parallel_size is None: + speculative_draft_tensor_parallel_size = \ + target_parallel_config.tensor_parallel_size + elif speculative_draft_tensor_parallel_size != 1: + # TODO(wooyeon): allow tp values larger than 1 + raise ValueError( + f"{speculative_draft_tensor_parallel_size=} cannot be" + f"other value than 1") + draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config. pipeline_parallel_size, - tensor_parallel_size=target_parallel_config.tensor_parallel_size, + tensor_parallel_size=speculative_draft_tensor_parallel_size, distributed_executor_backend=target_parallel_config. distributed_executor_backend, max_parallel_loading_workers=target_parallel_config. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5f1decb376af5..a7a806b055681 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -676,6 +676,28 @@ def get_world_group() -> GroupCoordinator: return _WORLD +def init_world_group(ranks: List[int], local_rank: int, + backend: str) -> GroupCoordinator: + return GroupCoordinator( + group_ranks=[ranks], + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=False, + use_custom_allreduce=False, + ) + + +def init_model_parallel_group(group_ranks: List[List[int]], local_rank: int, + backend: str) -> GroupCoordinator: + return GroupCoordinator( + group_ranks=group_ranks, + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=True, + use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, + ) + + _TP: Optional[GroupCoordinator] = None @@ -764,13 +786,7 @@ def init_distributed_environment( global _WORLD if _WORLD is None: ranks = list(range(torch.distributed.get_world_size())) - _WORLD = GroupCoordinator( - group_ranks=[ranks], - local_rank=local_rank, - torch_distributed_backend=backend, - use_pynccl=False, - use_custom_allreduce=False, - ) + _WORLD = init_world_group(ranks, local_rank, backend) else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") @@ -827,13 +843,8 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - _TP = GroupCoordinator( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + _TP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, backend) # Build the pipeline model-parallel groups. num_pipeline_model_parallel_groups: int = (world_size // @@ -845,13 +856,8 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - _PP = GroupCoordinator( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + _PP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, backend) def ensure_model_parallel_initialized( @@ -887,6 +893,34 @@ def model_parallel_is_initialized(): return (_TP is not None and _PP is not None) +_TP_STATE_PATCHED = False + + +@contextmanager +def patch_tensor_parallel_group(tp_group: GroupCoordinator): + """Patch the tp group temporarily until this function ends. 
+ + This method is for draft workers of speculative decoding to run draft model + with different tp degree from that of target model workers. + + Args: + tp_group (GroupCoordinator): the tp group coordinator + """ + global _TP_STATE_PATCHED + assert not _TP_STATE_PATCHED, "Should not call when it's already patched" + + _TP_STATE_PATCHED = True + old_tp_group = get_tp_group() + global _TP + _TP = tp_group + try: + yield + finally: + # restore the original state + _TP_STATE_PATCHED = False + _TP = old_tp_group + + def get_tensor_model_parallel_world_size(): """Return world size for the tensor model parallel group.""" return get_tp_group().world_size diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ef31612420c94..16374098b23d4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -94,6 +94,7 @@ class EngineArgs: guided_decoding_backend: str = 'outlines' # Speculative decoding configuration. speculative_model: Optional[str] = None + speculative_draft_tensor_parallel_size: Optional[int] = None num_speculative_tokens: Optional[int] = None speculative_max_model_len: Optional[int] = None speculative_disable_by_batch_size: Optional[int] = None @@ -537,6 +538,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.num_speculative_tokens, help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding.') + parser.add_argument( + '--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=EngineArgs.speculative_draft_tensor_parallel_size, + help='Number of tensor parallel replicas for ' + 'the draft model in speculative decoding.') parser.add_argument( '--speculative-max-model-len', @@ -686,6 +694,8 @@ def create_engine_config(self, ) -> EngineConfig: target_parallel_config=parallel_config, target_dtype=self.dtype, speculative_model=self.speculative_model, + speculative_draft_tensor_parallel_size = \ + self.speculative_draft_tensor_parallel_size, num_speculative_tokens=self.num_speculative_tokens, speculative_disable_by_batch_size=self. speculative_disable_by_batch_size, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 668ceefe6175f..e469fd7c3a160 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,8 @@ from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) -from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker @@ -28,9 +29,9 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Lazy initialization list. - self._proposer: Top1Proposer + self._proposer: SpeculativeProposer - def init_device(self): + def init_device(self) -> None: super().init_device() self._proposer = Top1Proposer( @@ -40,7 +41,7 @@ def init_device(self): max_proposal_len=self.max_model_len, ) - def set_include_gpu_probs_tensor(self): + def set_include_gpu_probs_tensor(self) -> None: # Need include_gpu_probs_tensor for multi_step_worker self.model_runner.model.sampler.include_gpu_probs_tensor = True @@ -73,7 +74,7 @@ def sampler_output( # Run model sample_len times. 
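        # Each pass below reuses the same copied request, so the draft model
        # proposes `sample_len` tokens one step at a time. When this worker is
        # wrapped by SmallerTpProposerWorker (added in this patch), the wrapper
        # has already switched vLLM's tensor-parallel group to the smaller
        # draft group before get_spec_proposals reaches this loop.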
model_outputs: List[SamplerOutput] = [] for _ in range(sample_len): - model_output = super().execute_model( + model_output: List[SamplerOutput] = super().execute_model( execute_model_req=copied_execute_model_req) assert (len(model_output) == 1 ), "composing multistep workers not supported" diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index fd67ceb912eee..b691659fb292b 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -3,10 +3,10 @@ from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposer -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase -class ProposerWorkerBase(WorkerBase, SpeculativeProposer): +class ProposerWorkerBase(LoraNotSupportedWorkerBase, SpeculativeProposer): """Interface for proposer workers""" @abstractmethod diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py new file mode 100644 index 0000000000000..b78e4489513f7 --- /dev/null +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -0,0 +1,149 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.distributed.parallel_state import (get_tp_group, + init_model_parallel_group, + patch_tensor_parallel_group) +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase + +logger = init_logger(__name__) + + +class SmallerTpProposerWorker(ProposerWorkerBase): + """Class which allows a speculative draft model to run with smaller tensor + parallel degree than target model. + This reduces the communication overhead of small draft models. + + To implement this feature, this class differs behavior based on is_dummy + flag, where dummy means worker that does not participate draft generation. + Participating workers use a smaller tp group by patching vLLM's tensor + parallel group temporarily during forward passes of draft models. + """ + + @classmethod + def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, + target_tensor_parallel_size: int): + """Wrap the worker in a SmallerTpProposerWorker if necessary. + """ + if draft_tensor_parallel_size == target_tensor_parallel_size: + return worker + + # gpu ranks that will generate draft tokens together + draft_ranks = list(range(draft_tensor_parallel_size)) + + logger.info("Wrapping {%s} in {%s}", type(worker), cls) + return cls(worker, draft_ranks) + + def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): + """Create a SmallerTpProposerWorker. + + Args: + worker (MultiStepWorker): an actual worker wrapped with this class + draft_ranks (List[int]): if this value is given, only the GPU ranks + written in this value participate in draft generation + """ + self._worker = worker + self._draft_ranks = draft_ranks + + # init during init_device + self._is_dummy = False + self._tp_group = None + + def _patch_tensor_parallel_group(self): + """Temporarily patch the global tp group state with its own tp group + state. 
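Putting the pieces together, end-to-end usage of the new draft tensor-parallel option looks roughly like this, mirroring the settings of the tests added in this patch (the tiny model and the sizes are illustrative only):

    from vllm import LLM

    llm = LLM(
        model="JackFram/llama-68m",
        tensor_parallel_size=2,                    # target model at TP=2
        speculative_model="JackFram/llama-68m",
        num_speculative_tokens=5,
        speculative_draft_tensor_parallel_size=1,  # draft model at TP=1
        use_v2_block_manager=True,                 # required for spec decode
    )

The same knob is exposed on the command line as --speculative-draft-tensor-parallel-size (alias -spec-draft-tp); for now only a draft tensor-parallel size of 1 is accepted when the option is set explicitly.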
+ """ + return patch_tensor_parallel_group(self._tp_group) + + def init_device(self) -> None: + self._is_dummy = get_tp_group().rank not in self._draft_ranks + + # dummy workers do nothing + if self._is_dummy: + return + + # creates tp process group containing only a subset of gpu ranks + local_rank = get_tp_group().local_rank + tp_backend = torch.distributed.get_backend(get_tp_group().device_group) + self._tp_group = init_model_parallel_group([self._draft_ranks], + local_rank, tp_backend) + + with self._patch_tensor_parallel_group(): + self._worker.init_device() + + def set_include_gpu_probs_tensor(self) -> None: + if self._is_dummy: + return + + # Need include_gpu_probs_tensor for multi_step_worker + self._worker.set_include_gpu_probs_tensor() + + def load_model(self) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + if self._is_dummy: + # this case is not used now + return -1, -1 + + with self._patch_tensor_parallel_group(): + return self._worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + # Do not check _is_dummy, as it's always called by get_spec_proposals + return self._worker.sampler_output(execute_model_req, sample_len) + + def get_spec_proposals( + self, + execute_model_req: ExecuteModelRequest, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. 
+ """ + if self._is_dummy: + return SpeculativeProposals(None, None, None) + + with self._patch_tensor_parallel_group(): + return self._worker.get_spec_proposals(execute_model_req) + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if self._is_dummy: + return [] + + with self._patch_tensor_parallel_group(): + return self._worker.execute_model(execute_model_req) + + def get_cache_block_size_bytes(self) -> int: + if self._is_dummy: + # by returning zero, target worker can use the entire kv cache space + return 0 + + return self._worker.get_cache_block_size_bytes() + + @property + def vocab_size(self) -> int: + return self._worker.vocab_size diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 58d3461a25188..5089e3dd556e9 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,7 @@ import torch -from vllm.config import SpeculativeConfig +from vllm.config import ParallelConfig, SpeculativeConfig from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler @@ -18,6 +18,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker from vllm.spec_decode.util import (create_sequence_group_output, get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, @@ -90,7 +91,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): @classmethod def create_worker( cls, - scorer_worker: WorkerBase, + scorer_worker: Worker, draft_worker_kwargs: Dict[str, Any], disable_by_batch_size: Optional[int], ) -> "SpecDecodeWorker": @@ -111,7 +112,14 @@ def create_worker( proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) disable_bonus_tokens = False else: + draft_parallel_config: ParallelConfig = draft_worker_kwargs[ + 'parallel_config'] + draft_tp = draft_parallel_config.tensor_parallel_size + target_tp = scorer_worker.parallel_config.tensor_parallel_size + proposer_worker = MultiStepWorker(**draft_worker_kwargs) + proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( + proposer_worker, draft_tp, target_tp) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 7b993143014c95844b380a5b05eebd14ad77b7aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 26 Jun 2024 00:41:36 +0800 Subject: [PATCH 94/98] [Misc] Remove useless code in cpu_worker (#5824) --- vllm/worker/cpu_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3ee394f9912e9..914df0c7df0e8 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -277,7 +277,6 @@ def execute_model( assert seq_group_metadata_list is not None num_seq_groups: int = len(seq_group_metadata_list) assert execute_model_req is not None - blocks_to_copy = execute_model_req.blocks_to_copy blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, device="cpu", dtype=torch.int64).view(-1, 2) From 67882dbb44186d781ab6db9eaec08f6616dc86bd Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 25 Jun 2024 10:15:10 -0700 Subject: [PATCH 95/98] [Core] Add fault tolerance for `RayTokenizerGroupPool` (#5748) --- 
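A condensed sketch of the recovery policy this patch gives the Ray tokenizer pool; the helper names below (get_idle_actor, create_replacement_actor, mark_unhealthy) are illustrative stand-ins rather than the real methods:

    from ray.exceptions import ActorDiedError

    def encode_with_recovery(pool, do_encode):
        actor = pool.get_idle_actor()
        try:
            return do_encode(actor)
        except ActorDiedError:
            # First death: transparently replace the dead actor and retry once.
            actor = pool.create_replacement_actor()
            try:
                return do_encode(actor)
            except ActorDiedError as e:
                # Second death in a row: remember the failure so that the
                # pool's (and the engine's) check_health() re-raises it.
                pool.mark_unhealthy(e)
                pool.check_health()

Exceptions other than ActorDiedError (for example a too-long prompt) still propagate directly and do not trigger a replacement.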
tests/tokenization/test_tokenizer_group.py | 99 ++++++++++++++++ vllm/engine/async_llm_engine.py | 2 + vllm/engine/llm_engine.py | 2 + .../tokenizer_group/base_tokenizer_group.py | 4 + .../tokenizer_group/ray_tokenizer_group.py | 112 ++++++++++++++---- 5 files changed, 195 insertions(+), 24 deletions(-) diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 31571dbfff6f6..1b9a590750429 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,5 +1,7 @@ import asyncio import os +import sys +from typing import List, Optional from unittest.mock import patch import pytest @@ -100,3 +102,100 @@ class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool): max_num_seqs=1, max_input_length=None) tokenizer_pool.ping() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) +async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): + """Test that Ray tokenizer pool group can recover from failures and + if that's not possible, mark itself as unhealthy.""" + + class FailingTokenizerGroup(TokenizerGroup): + + def __init__(self, + *args, + fail_at: Optional[List[int]] = None, + **kwargs): + super().__init__(*args, **kwargs) + self.i = 0 + self.fail_at = fail_at or [] + + def encode(self, *args, **kwargs): + self.i += 1 + if self.i in self.fail_at: + sys.exit(1) + return super().encode(*args, **kwargs) + + class FailingRayTokenizerGroupPool(RayTokenizerGroupPool): + _worker_cls = FailingTokenizerGroup + + # Fail at first iteration + fail_at = [1] + tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None, + fail_at=fail_at) + tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy() + + # Modify fail at to not fail at all (will be re-read when actor is + # re-initialized). + fail_at[0] = 1000 + + # We should recover successfully. + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + + # Check that we have a new actor + assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors) + assert tokenizer_group_pool.tokenizer_actors != tokenizer_actors + + # Fail at first iteration + fail_at = [1] + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None, + fail_at=fail_at) + + # We should fail after re-initialization. + with pytest.raises(RuntimeError): + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + + # check_health should raise the same thing + with pytest.raises(RuntimeError): + tokenizer_group_pool.check_health() + + # Ensure that non-ActorDiedErrors are still propagated correctly and do not + # cause a re-initialization. 
+ fail_at = [] + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=2, + fail_at=fail_at) + tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy() + + # Prompt too long error + with pytest.raises(ValueError): + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt" * 100, + lora_request=None) + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + # Actors should stay the same. + assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index df25eb111e87f..7994b873fe9bd 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -310,6 +310,8 @@ async def add_request_async( ) async def check_health_async(self) -> None: + if self.tokenizer: + self.tokenizer.check_health() self.model_executor.check_health() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f7eae257fdd16..0ad957ef9f958 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1013,6 +1013,8 @@ def pin_lora(self, lora_id: int) -> bool: return self.model_executor.pin_lora(lora_id) def check_health(self) -> None: + if self.tokenizer: + self.tokenizer.check_health() self.model_executor.check_health() def is_tracing_enabled(self) -> bool: diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index 3cce96e06d1a0..18fbd894f1c0e 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -53,3 +53,7 @@ async def get_lora_tokenizer_async( ) -> "PreTrainedTokenizer": """Get a tokenizer for a LoRA request.""" pass + + def check_health(self): + """Raise exception if the tokenizer group is unhealthy.""" + return diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 7c605416854b8..21ec2b52bc95e 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -2,17 +2,21 @@ import os from typing import List, Optional +from ray.exceptions import ActorDiedError from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy from transformers import PreTrainedTokenizer from vllm.config import TokenizerPoolConfig from vllm.executor.ray_utils import ray +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( BaseTokenizerGroup) from vllm.transformers_utils.tokenizer_group.tokenizer_group import ( TokenizerGroup) +logger = init_logger(__name__) + class RayTokenizerGroupPool(BaseTokenizerGroup): """A Ray-based pool of TokenizerGroups for async tokenization.""" @@ -46,24 +50,28 @@ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, ray_actor_options: dict, **tokenizer_config): # Store a local copy of the TokenizerGroup for quick access # to underlying HF tokenizers. 
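        # Keeping the full constructor arguments in `_tokenizer_config` below
        # is what allows `_init_actor()` to spin up a replacement actor when
        # one dies with an ActorDiedError during encode; after a second
        # consecutive death the pool records the exception and check_health()
        # re-raises it.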
+ self._tokenizer_config = { + "tokenizer_id": tokenizer_id, + "enable_lora": enable_lora, + "max_num_seqs": max_num_seqs, + "max_input_length": max_input_length, + **tokenizer_config + } self._local_tokenizer_group = self._worker_cls( - tokenizer_id=tokenizer_id, - enable_lora=enable_lora, - max_num_seqs=max_num_seqs, - max_input_length=max_input_length, - **tokenizer_config, - ) - - ray_tokenizer_group_cls = ray.remote( + **self._tokenizer_config, ) + + self._ray_tokenizer_group_cls = ray.remote( self._worker_cls).options(**ray_actor_options) - self.tokenizer_actors = [ - ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora, - max_num_seqs, max_input_length, - **tokenizer_config) - for _ in range(num_actors) - ] + self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)] self._idle_actors: Optional[asyncio.Queue] = None + # If set, actor is unhealthy. Will reraise on the next + # check_health call. + self._exception: Optional[ActorDiedError] = None + + def _init_actor(self) -> ray.ObjectRef: + return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config) + @property def pool_size(self) -> int: return len(self.tokenizer_actors) @@ -78,6 +86,22 @@ def _ensure_queue_initialized(self): for actor in self.tokenizer_actors: self._idle_actors.put_nowait(actor) + def _finalize_encode(self, actor: ray.ObjectRef, + original_actor: ray.ObjectRef, actor_is_alive: bool): + assert self._idle_actors is not None + # Cleanup the dead actor. + if not actor_is_alive or original_actor is not actor: + self.tokenizer_actors.remove(original_actor) + if actor_is_alive: + # Put the actor back in the queue. + # This is done in a finally block to ensure that the actor is + # always put back in the queue, even if an exception/cancellation + # is raised. + self._idle_actors.put_nowait(actor) + # Add back the new actor. + if original_actor is not actor: + self.tokenizer_actors.append(actor) + def encode(self, prompt: str, request_id: Optional[str] = None, @@ -88,23 +112,41 @@ def encode(self, The actor is then put back in the queue for future use. This is blocking. """ + self.check_health() self._ensure_queue_initialized() assert self._idle_actors is not None if self._idle_actors.empty(): raise RuntimeError("No idle actors available.") actor = self._idle_actors.get_nowait() + actor_is_alive = True + original_actor = actor try: ret = ray.get( actor.encode.remote(request_id=request_id, prompt=prompt, lora_request=lora_request)) + except ActorDiedError as e: + # If the actor is dead, we first try to reinitialize it. + logger.warning("%s died with ActorDiedError, reinitializing.", + actor, + exc_info=e) + actor = self._init_actor() + try: + ret = ray.get( + actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request)) + except ActorDiedError as e: + logger.error( + "%s died for second time in a row, marking " + "RayTokenizerGroupPool as unhealthy.", actor) + actor_is_alive = False + if not self._exception: + self._exception = e + self.check_health() finally: - # Put the actor back in the queue. - # This is done in a finally block to ensure that the actor is - # always put back in the queue, even if an exception/cancellation - # is raised. - self._idle_actors.put_nowait(actor) + self._finalize_encode(actor, original_actor, actor_is_alive) return ret async def encode_async( @@ -120,20 +162,37 @@ async def encode_async( The actor is then put back in the queue for future use. This is non-blocking. 
""" + self.check_health() self._ensure_queue_initialized() assert self._idle_actors is not None actor = await self._idle_actors.get() + actor_is_alive = True + original_actor = actor try: ret = await actor.encode.remote(request_id=request_id, prompt=prompt, lora_request=lora_request) + except ActorDiedError as e: + # If the actor is dead, we first try to reinitialize it. + logger.warning("%s died with ActorDiedError, reinitializing.", + actor, + exc_info=e) + actor = self._init_actor() + try: + ret = await actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request) + except ActorDiedError as e: + logger.error( + "%s died for second time in a row, marking " + "RayTokenizerGroupPool as unhealthy.", actor) + actor_is_alive = False + if not self._exception: + self._exception = e + self.check_health() finally: - # Put the actor back in the queue. - # This is done in a finally block to ensure that the actor is - # always put back in the queue, even if an exception/cancellation - # is raised. - self._idle_actors.put_nowait(actor) + self._finalize_encode(actor, original_actor, actor_is_alive) return ret def get_max_input_len(self, @@ -155,6 +214,11 @@ async def get_lora_tokenizer_async( return await self._local_tokenizer_group.get_lora_tokenizer_async( lora_request) + def check_health(self): + if self._exception: + raise RuntimeError( + "TokenizerGroupPool is unhealthy.") from self._exception + def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None: """Copy over all current process environment variables to the runtime_env. From c18ebfdd71d16eb18617676b0b1d82ebde0027f0 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 25 Jun 2024 12:10:28 -0700 Subject: [PATCH 96/98] [doc][distributed] add both gloo and nccl tests (#5834) --- docs/source/getting_started/debugging.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index a22bba1478abb..4cd34769ecfb4 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -28,8 +28,8 @@ If it crashes, and the error trace shows somewhere around ``self.graph.replay()` Here are some common issues that can cause hangs: -- **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. -- **Incorrect hardware/driver**: GPU communication cannot be established. You can run the following sanity check script to see if the GPU communication is working correctly. +- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address. 
+- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly. .. code-block:: python @@ -41,7 +41,14 @@ Here are some common issues that can cause hangs: dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.cuda.synchronize() value = data.mean().item() - assert value == dist.get_world_size() + world_size = dist.get_world_size() + assert value == world_size, f"Expected {world_size}, got {value}" + + gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") + cpu_data = torch.FloatTensor([1,] * 128) + dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) + value = cpu_data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" .. tip:: From d9b34baeddc7f48a526dc610429a3c8670b3b339 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 25 Jun 2024 15:18:03 -0400 Subject: [PATCH 97/98] [CI/Build] Add unit testing for FlexibleArgumentParser (#5798) --- tests/test_utils.py | 61 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0b674ea6a85c1..8203b5d2f960d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,7 +7,8 @@ import pytest -from vllm.utils import deprecate_kwargs, get_open_port, merge_async_iterators +from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs, + get_open_port, merge_async_iterators) from .utils import error_on_warning @@ -130,3 +131,61 @@ def test_get_open_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: s3.bind(("localhost", get_open_port())) os.environ.pop("VLLM_PORT") + + +# Tests for FlexibleArgumentParser +@pytest.fixture +def parser(): + parser = FlexibleArgumentParser() + parser.add_argument('--image-input-type', + choices=['pixel_values', 'image_features']) + parser.add_argument('--model-name') + parser.add_argument('--batch-size', type=int) + parser.add_argument('--enable-feature', action='store_true') + return parser + + +def test_underscore_to_dash(parser): + args = parser.parse_args(['--image_input_type', 'pixel_values']) + assert args.image_input_type == 'pixel_values' + + +def test_mixed_usage(parser): + args = parser.parse_args([ + '--image_input_type', 'image_features', '--model-name', + 'facebook/opt-125m' + ]) + assert args.image_input_type == 'image_features' + assert args.model_name == 'facebook/opt-125m' + + +def test_with_equals_sign(parser): + args = parser.parse_args( + ['--image_input_type=pixel_values', '--model-name=facebook/opt-125m']) + assert args.image_input_type == 'pixel_values' + assert args.model_name == 'facebook/opt-125m' + + +def test_with_int_value(parser): + args = parser.parse_args(['--batch_size', '32']) + assert args.batch_size == 32 + args = parser.parse_args(['--batch-size', '32']) + assert args.batch_size == 32 + + +def test_with_bool_flag(parser): + args = parser.parse_args(['--enable_feature']) + assert args.enable_feature is True + args = parser.parse_args(['--enable-feature']) + assert args.enable_feature is True + + +def test_invalid_choice(parser): + with pytest.raises(SystemExit): + parser.parse_args(['--image_input_type', 'invalid_choice']) + + +def test_missing_required_argument(parser): + parser.add_argument('--required-arg', required=True) + with pytest.raises(SystemExit): + parser.parse_args([]) From dd248f76756adba4a1637b882e79ab639f957feb Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 25 Jun 
2024 15:23:35 -0400 Subject: [PATCH 98/98] [Misc] Update `w4a16` `compressed-tensors` support to include `w8a16` (#5794) --- tests/quantization/test_compressed_tensors.py | 23 +++++++-------- .../compressed_tensors/compressed_tensors.py | 28 +++++++++++-------- .../compressed_tensors/schemes/__init__.py | 5 ++-- .../schemes/compressed_tensors_w4a16_24.py | 1 + ...s_w4a16.py => compressed_tensors_wNa16.py} | 5 ++-- 5 files changed, 36 insertions(+), 26 deletions(-) rename vllm/model_executor/layers/quantization/compressed_tensors/schemes/{compressed_tensors_w4a16.py => compressed_tensors_wNa16.py} (98%) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index aaa366335d196..6eb7ff72fb11d 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -8,9 +8,9 @@ from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsLinearMethod, CompressedTensorsW4A16, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, - CompressedTensorsW8A8StaticTensor) + CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor, + CompressedTensorsWNA16) @pytest.mark.parametrize("model_args", [ @@ -74,26 +74,27 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): assert qkv_proj.weight.dtype is torch.int8 -@pytest.mark.parametrize("w4a16_args", [ - ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None), - ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128), -]) -def test_compressed_tensors_w4a16(vllm_runner, w4a16_args): - model, strategy, group = w4a16_args +@pytest.mark.parametrize( + "wNa16_args", + [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8), + ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8), + ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)]) +def test_compressed_tensors_w4a16(vllm_runner, wNa16_args): + model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] qkv_proj = layer.self_attn.qkv_proj assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) assert qkv_proj.scheme.strategy == strategy assert qkv_proj.scheme.group_size == group assert qkv_proj.weight_packed.dtype is torch.int32 assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.weight_packed.pack_factor == 8 + assert qkv_proj.weight_packed.pack_factor == pack_factor def test_compressed_tensors_w4a16_marlin24(vllm_runner): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 44dd024afe74d..c69e2f3bcf9fa 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -7,9 +7,10 @@ from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme, 
CompressedTensorsW4A16, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, - CompressedTensorsW8A8StaticTensor) + W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, + CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor, + CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) @@ -108,26 +109,31 @@ def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, return is_8_bits and is_token and is_symmetric and is_dynamic - def _is_w4a16(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_wNa16_group_channel(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: input_quant_none = input_quant is None - is_4_bits = weight_quant.num_bits == 4 is_symmetric = weight_quant.symmetric + is_channel_group = ( + weight_quant.strategy == QuantizationStrategy.CHANNEL.value + or weight_quant.strategy == QuantizationStrategy.GROUP.value) is_static = not weight_quant.dynamic - return is_4_bits and input_quant_none and is_symmetric and is_static + return (is_channel_group and input_quant_none and is_symmetric + and is_static) def _get_schema(self, weight_quant: BaseModel, input_quant: BaseModel) -> "CompressedTensorsScheme": - if self._is_w4a16(weight_quant, input_quant): - if self.quant_format == CompressionFormat.marlin_24.value: + if self._is_wNa16_group_channel(weight_quant, input_quant): + if (self.quant_format == CompressionFormat.marlin_24.value + and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS): return CompressedTensorsW4A16Sparse24( strategy=weight_quant.strategy, num_bits=weight_quant.num_bits, group_size=weight_quant.group_size) - if self.quant_format == CompressionFormat.pack_quantized.value: - return CompressedTensorsW4A16( + if (self.quant_format == CompressionFormat.pack_quantized.value + and weight_quant.num_bits in WNA16_SUPPORTED_BITS): + return CompressedTensorsWNA16( num_bits=weight_quant.num_bits, strategy=weight_quant.strategy, group_size=weight_quant.group_size) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 3c95aa11fc76c..f6d20ce2c6f77 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,10 +1,11 @@ from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 from .compressed_tensors_unquantized import ( # noqa: F401 CompressedTensorsUnquantized) -from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401 from .compressed_tensors_w4a16_24 import ( # noqa: F401 - CompressedTensorsW4A16Sparse24) + W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 CompressedTensorsW8A8DynamicToken) from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 CompressedTensorsW8A8StaticTensor) +from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS # noqa: F401 +from .compressed_tensors_wNa16 import CompressedTensorsWNA16 # noqa: F401 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 
d7e04ddb8d94a..607029c819ddb 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -11,6 +11,7 @@ from vllm.model_executor.utils import set_weight_attrs __all__ = ["CompressedTensorsW4A16Sparse24"] +W4A16SPARSE24_SUPPORTED_BITS = [4] class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py similarity index 98% rename from vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 373458cfffe04..7707ea6ee94bc 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -11,10 +11,11 @@ marlin_permute_scales) from vllm.model_executor.utils import set_weight_attrs -__all__ = ["CompressedTensorsW4A16"] +__all__ = ["CompressedTensorsWNA16"] +WNA16_SUPPORTED_BITS = [4, 8] -class CompressedTensorsW4A16(CompressedTensorsScheme): +class CompressedTensorsWNA16(CompressedTensorsScheme): def __init__(self, strategy: str,
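# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): why the updated test above expects
# weight_packed.pack_factor == 8 for the w4a16 checkpoints and == 4 for the new
# w8a16 checkpoint. Quantized weights are packed into int32 storage, so one
# 32-bit element holds 32 // num_bits weights. The helper below is illustrative
# only, not a vLLM API.
# ---------------------------------------------------------------------------
WNA16_SUPPORTED_BITS = [4, 8]  # mirrors the constant this patch introduces


def wna16_pack_factor(num_bits: int, storage_bits: int = 32) -> int:
    """Number of quantized weights stored per packed int32 element."""
    if num_bits not in WNA16_SUPPORTED_BITS:
        raise ValueError(f"unsupported weight width: {num_bits} bits")
    return storage_bits // num_bits


assert wna16_pack_factor(4) == 8  # w4a16 channel/group test cases
assert wna16_pack_factor(8) == 4  # w8a16 per-channel test case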