diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index eec2a51e2f8fd..64ba1b32fb074 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -9,8 +9,11 @@ steps:
- image: badouralix/curl-jq
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+
- wait
+
- label: "A100"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
plugins:
@@ -18,7 +21,7 @@ steps:
podSpec:
priorityClassName: perf-benchmark
containers:
- - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources:
@@ -41,20 +44,48 @@ steps:
- name: devshm
emptyDir:
medium: Memory
- # - label: "H100"
- # agents:
- # queue: H100
- # plugins:
- # - docker#v5.11.0:
- # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- # command:
- # - bash
- # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
- # mount-buildkite-agent: true
- # propagate-environment: true
- # ipc: host
- # gpus: all
- # environment:
- # - VLLM_USAGE_SOURCE
- # - HF_TOKEN
+ - label: "H200"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+ agents:
+ queue: H200
+ plugins:
+ - docker#v5.12.0:
+ image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+ command:
+ - bash
+ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+ mount-buildkite-agent: true
+ propagate-environment: true
+ ipc: host
+ gpus: 4,5,6,7
+ volumes:
+ - /data/benchmark-hf-cache:/root/.cache/huggingface
+ environment:
+ - VLLM_USAGE_SOURCE
+ - HF_TOKEN
+
+ - block: "Run H100 Benchmark"
+ key: block-h100
+ depends_on: ~
+
+ - label: "H100"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+ agents:
+ queue: H100
+ depends_on: block-h100
+ plugins:
+ - docker#v5.12.0:
+ image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+ command:
+ - bash
+ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+ mount-buildkite-agent: true
+ propagate-environment: true
+ ipc: host
+ gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+ volumes:
+ - /data/benchmark-hf-cache:/root/.cache/huggingface
+ environment:
+ - VLLM_USAGE_SOURCE
+ - HF_TOKEN
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 7cf05610b9953..9d3646e2f6a15 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -157,6 +157,18 @@ def results_to_json(latency, throughput, serving):
throughput_results,
serving_results)
+    for df in [latency_results, serving_results, throughput_results]:
+        if df.empty:
+            continue
+
+        # Sort all dataframes by their respective "Test name" columns
+        df.sort_values(by="Test name", inplace=True)
+
+        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
+        # we want to turn it into "8xGPUTYPE"
+        df["GPU"] = df["GPU"].apply(
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+
# get markdown tables
latency_md_table = tabulate(latency_results,
headers='keys',
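
For reference, a minimal sketch of what the added post-processing does to each results table, using hypothetical sample data (pandas is already a dependency of the script above): rows are sorted by "Test name" and a newline-joined GPU column such as "H100\nH100\n..." is collapsed into "<count>xH100".

```python
# Sketch of the dataframe post-processing added above (hypothetical sample data).
import pandas as pd

df = pd.DataFrame({
    "Test name": ["serving_llama8B_tp1_qps_4", "latency_llama8B_tp1"],
    "GPU": ["H100\nH100\nH100\nH100", "H100"],
})

# Sort rows by test name, as the script now does for every results table.
df.sort_values(by="Test name", inplace=True)

# Collapse "GPUTYPE\nGPUTYPE\n..." into "<count>xGPUTYPE".
df["GPU"] = df["GPU"].apply(lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}")

print(df.to_string(index=False))
# latency_llama8B_tp1       -> GPU column becomes "1xH100"
# serving_llama8B_tp1_qps_4 -> GPU column becomes "4xH100"
```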
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index d397b05cdff23..0d16a83781ab2 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -6,6 +6,7 @@
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
+set -x
set -o pipefail
check_gpus() {
@@ -85,11 +86,7 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
- pkill -f pt_main_thread
- # this line doesn't work now
- # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
- pkill -f python3
- pkill -f /usr/bin/python3
+ pgrep python3 | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
@@ -289,7 +286,7 @@ run_serving_tests() {
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
- eval "$server_command" &
+ bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -322,7 +319,7 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
- eval "$client_command"
+ bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
index 19f7160e68a4d..aa0f7ade808e0 100644
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -1,6 +1,6 @@
#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10
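
For reference, a rough Python equivalent of the manifest check this helper performs against the renamed repository. The token and manifest endpoints come from the lines above; the polling loop, sleep interval, and commit value are assumptions, since the rest of the script is outside this hunk.

```python
# Rough sketch of the image-availability check (endpoints taken from the script above;
# the retry loop and the commit string are assumptions for illustration only).
import json
import time
import urllib.error
import urllib.request

REPO = "q9t5s3a7/vllm-ci-postmerge-repo"
commit = "abc123"  # stand-in for $BUILDKITE_COMMIT

token_url = ("https://public.ecr.aws/token?service=public.ecr.aws"
             f"&scope=repository:{REPO}:pull")
with urllib.request.urlopen(token_url) as resp:
    token = json.load(resp)["token"]

manifest_url = f"https://public.ecr.aws/v2/{REPO}/manifests/{commit}"
request = urllib.request.Request(
    manifest_url, headers={"Authorization": f"Bearer {token}"})

deadline = time.time() + 10  # mirrors TIMEOUT_SECONDS above
while time.time() < deadline:
    try:
        with urllib.request.urlopen(request):
            print("image manifest found, proceeding")
            break
    except urllib.error.HTTPError:
        time.sleep(2)  # image not pushed yet; poll again
else:
    raise SystemExit("timed out waiting for the CI image")
```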
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index f78e360b7afd3..2de6fceb0c3fe 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,7 +1,7 @@
steps:
- label: "Build wheel - CUDA 12.1"
agents:
- queue: cpu_queue
+ queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
@@ -18,7 +18,7 @@ steps:
- label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel
agents:
- queue: cpu_queue
+ queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
@@ -26,3 +26,32 @@ steps:
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
+
+ - block: "Build release image"
+ depends_on: ~
+ key: block-release-image-build
+
+ - label: "Build release image"
+ depends_on: block-release-image-build
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+ - label: "Build and publish TPU release image"
+ depends_on: ~
+ if: build.env("NIGHTLY") == "1"
+ agents:
+ queue: tpu_queue_postmerge
+ commands:
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+ - "docker push vllm/vllm-tpu:nightly"
+ - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllm
+ password-env: DOCKERHUB_TOKEN
+ env:
+ DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 902e162720b89..3515ccd65667e 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -85,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
- --ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 5d7a0bff90963..bc06838d804ff 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -4,49 +4,11 @@
# It serves a sanity check for compilation and basic model usage.
set -ex
-# Try building the docker image
-docker build -t cpu-test -f Dockerfile.ppc64le .
-
# Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; }
+remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
trap remove_docker_container EXIT
remove_docker_container
-# Run the image, setting --shm-size=4g for tensor parallel.
-source /etc/environment
-#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
-
-function cpu_tests() {
- set -e
-
- # Run basic model test
- docker exec cpu-test bash -c "
- set -e
- pip install pytest pytest-asyncio \
- decord einops librosa peft Pillow sentence-transformers soundfile \
- transformers_stream_generator matplotlib datamodel_code_generator
- pip install torchvision --index-url https://download.pytorch.org/whl/cpu
- pytest -v -s tests/models/decoder_only/language -m cpu_model
- pytest -v -s tests/models/embedding/language -m cpu_model
- pytest -v -s tests/models/encoder_decoder/language -m cpu_model
- pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
- pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
-
- # online inference
- docker exec cpu-test bash -c "
- set -e
- python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
- timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
- python3 benchmarks/benchmark_serving.py \
- --backend vllm \
- --dataset-name random \
- --model facebook/opt-125m \
- --num-prompts 20 \
- --endpoint /v1/completions \
- --tokenizer facebook/opt-125m"
-}
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .
-# All of CPU tests are expected to be finished less than 25 mins.
-export -f cpu_tests
-timeout 25m bash -c "cpu_tests"
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 14756b5964aaf..4f1729d46dae2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -13,26 +13,27 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup
-remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
+remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
function cpu_tests() {
set -e
+ export NUMA_NODE=$2
# offline inference
- docker exec cpu-test-avx2 bash -c "
+ docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference.py"
# Run basic model test
- docker exec cpu-test bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -45,20 +46,26 @@ function cpu_tests() {
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# Run compressed-tensor test
- docker exec cpu-test bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
- docker exec cpu-test bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"
+ # Run chunked-prefill and prefix-cache test
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -s -v -k cpu_model \
+ tests/basic_correctness/test_chunked_prefill.py"
+
# online inference
- docker exec cpu-test bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -75,4 +82,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 25 mins.
export -f cpu_tests
-timeout 25m bash -c "cpu_tests $CORE_RANGE"
+timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
new file mode 100644
index 0000000000000..d06604f96f2b8
--- /dev/null
+++ b/.buildkite/run-gh200-test.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# This script builds the GH200 docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+DOCKER_BUILDKIT=1 docker build . \
+ --target vllm-openai \
+ --platform "linux/arm64" \
+ -t gh200-test \
+ --build-arg max_jobs=66 \
+ --build-arg nvcc_threads=2 \
+ --build-arg torch_cuda_arch_list="9.0+PTX" \
+ --build-arg vllm_fa_cmake_gpu_arches="90-real"
+
+# Setup cleanup
+remove_docker_container() { docker rm -f gh200-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and test offline inference
+docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+ python3 examples/offline_inference.py
+'
diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
index 4505dc7a9373c..fa4f74fca7a11 100644
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
\ No newline at end of file
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
\ No newline at end of file
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index faeac8e2ded36..e0a12afbe7320 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -12,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
-# Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
+# Run the image and test offline inference/tensor parallel
+docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+ python3 examples/offline_inference.py
+ python3 examples/offline_inference_cli.py -tp 2
+'
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 24bf223fb12c0..b563c96343f92 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -9,8 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
-# nightly(bool): run this test in nightly pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually)
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -51,7 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
+ - tests/standalone_tests/lazy_torch_compile.py
commands:
+ - python3 standalone_tests/lazy_torch_compile.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@@ -60,6 +61,13 @@ steps:
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
+- label: Python-only Installation Test
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
- label: Basic Correctness Test # 30min
#mirror_hardwares: [amd]
fast_check: true
@@ -171,16 +179,16 @@ steps:
- vllm/
- tests/v1
commands:
- - pytest -v -s v1
+ - VLLM_USE_V1=1 pytest -v -s v1
-- label: Examples Test # 15min
+- label: Examples Test # 25min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
commands:
- - pip install awscli tensorizer # for llava example and tensorizer test
+ - pip install tensorizer # for tensorizer test
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_chat.py
@@ -190,10 +198,13 @@ steps:
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- - python3 offline_profile.py --model facebook/opt-125m
+ - python3 offline_inference_classification.py
+ - python3 offline_inference_embedding.py
+ - python3 offline_inference_scoring.py
+ - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
- #mirror_hardwares: [amd]
+ mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
@@ -213,8 +224,12 @@ steps:
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
+ - vllm/model_executor/guided_decoding
- tests/test_logits_processor
- command: pytest -v -s test_logits_processor.py
+ - tests/model_executor/test_guided_processors
+ commands:
+ - pytest -v -s test_logits_processor.py
+ - pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 30min
source_file_dependencies:
@@ -229,7 +244,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -313,7 +328,7 @@ steps:
##### models test #####
-- label: Basic Models Test # 30min
+- label: Basic Models Test # 24min
source_file_dependencies:
- vllm/
- tests/models
@@ -323,7 +338,7 @@ steps:
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py
-- label: Language Models Test (Standard) # 42min
+- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -333,10 +348,9 @@ steps:
commands:
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- - pytest -v -s models/embedding/vision_language -m core_model
-- label: Language Models Test (Extended) # 50min
- nightly: true
+- label: Language Models Test (Extended) # 1h10min
+ optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
@@ -345,9 +359,8 @@ steps:
commands:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- - pytest -v -s models/embedding/vision_language -m 'not core_model'
-- label: Multi-Modal Models Test (Standard) # 26min
+- label: Multi-Modal Models Test (Standard) # 28min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -356,13 +369,15 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+ - pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
-- label: Multi-Modal Models Test (Extended) # 1h15m
- nightly: true
+- label: Multi-Modal Models Test (Extended) 1 # 1h16m
+ optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
@@ -370,14 +385,26 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+ - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+ - pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
+- label: Multi-Modal Models Test (Extended) 2 # 38m
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/decoder_only/vision_language
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true
@@ -412,11 +439,11 @@ steps:
- tests/distributed/
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd]
@@ -429,19 +456,23 @@ steps:
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
+ - vllm/worker/worker_base.py
+ - vllm/worker/worker.py
+ - vllm/worker/model_runner.py
commands:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
+ - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
@@ -474,18 +505,22 @@ steps:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
-- label: LoRA Long Context (Distributed) # 11min
- # This test runs llama 13B, so it is required to run on 4 GPUs.
+- label: LoRA TP Test (Distributed)
num_gpus: 4
- soft_fail: true
source_file_dependencies:
- vllm/lora
- - tests/lora/test_long_context
+ - tests/lora
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+
- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests"
@@ -513,6 +548,7 @@ steps:
- label: Distributed Tests (A100) # optional
gpu: a100
+ optional: true
num_gpus: 4
source_file_dependencies:
- vllm/
@@ -521,11 +557,12 @@ steps:
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
+ optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh
index 541b395eddbe7..7345dd4e66b29 100644
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@@ -25,7 +25,12 @@ echo "Version: $version"
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
- new_version="1.0.0.dev"
+ suffix="${version##*.}"
+ if [[ $suffix == cu* ]]; then
+ new_version="1.0.0.dev+${suffix}"
+ else
+ new_version="1.0.0.dev"
+ fi
new_wheel="${wheel/$version/$new_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
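
For reference, the renaming rule added above expressed as a small Python sketch. The version strings below are hypothetical; the real script then applies the substitution to the wheel filename via `${wheel/$version/$new_version}`.

```python
# Sketch of the dev-wheel renaming rule from the hunk above (hypothetical versions).
def rename_dev_version(version: str) -> str:
    if "dev" not in version:
        return version
    suffix = version.rsplit(".", 1)[-1]   # mirrors ${version##*.}
    if suffix.startswith("cu"):
        return f"1.0.0.dev+{suffix}"      # keep the CUDA-variant suffix
    return "1.0.0.dev"

print(rename_dev_version("0.6.4.dev123.cu118"))  # -> 1.0.0.dev+cu118
print(rename_dev_version("0.6.4.dev123"))        # -> 1.0.0.dev
```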
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index cd721971d01d6..3cb91fc0f8232 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,13 +3,16 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-CMakeLists.txt @tlrmchlsmth @WoosukKwon
+/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+CMakeLists.txt @tlrmchlsmth
+
+# vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 71f4e520135d4..d1f6105a47166 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1,2 +1,2 @@
github: [vllm-project]
-open_collective: [vllm]
+open_collective: vllm
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 4f54eea564ecb..683b70cd89989 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -15,6 +15,8 @@ updates:
allow:
- dependency-type: "all"
ignore:
+ - dependency-name: "*"
+ update-types: ["version-update:semver-patch"]
- dependency-name: "torch"
- dependency-name: "torchvision"
- dependency-name: "xformers"
@@ -24,9 +26,6 @@ updates:
- dependency-name: "ray[adag]"
- dependency-name: "lm-eval"
groups:
- patch-update:
- applies-to: version-updates
- update-types: ["patch"]
minor-update:
applies-to: version-updates
update-types: ["minor"]
diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh
index 3b2da7b9f8966..3246c6f9bc4b7 100755
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -15,19 +15,36 @@ NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
-# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
-sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"
-
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
sed -i '/FIX #xxxx.*$/d' "${NEW}"
# Remove "FILL IN THE PR DESCRIPTION HERE"
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
+sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
+
+# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
+python3 - <<EOF
+import re
+
+with open("${NEW}", "r") as file:
+    content = file.read()
+
+pattern = re.compile(r'<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
+content = re.sub(pattern, '', content)
+
+with open("${NEW}", "w") as file:
+    file.write(content)
+EOF
+
# Run this only if ${NEW} is different than ${OLD}
if ! cmp -s "${OLD}" "${NEW}"; then
- echo "Updating PR body"
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
+ echo
+ echo "Updated PR body:"
+ echo
+ cat "${NEW}"
else
echo "No changes needed"
fi
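
For reference, a quick illustration of the <details>-stripping pattern used above, applied to a hypothetical PR body:

```python
# Demo of the <details>-section removal on a hypothetical PR body.
import re

body = """Fixes a tokenizer bug.

<details>
<summary> PR Checklist (Click to Expand) </summary>

- [ ] item one
- [ ] item two
</details>
"""

pattern = re.compile(
    r'<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>',
    re.DOTALL)

print(re.sub(pattern, '', body))
# -> "Fixes a tokenizer bug.\n\n\n" (the checklist block is removed)
```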
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
new file mode 100644
index 0000000000000..ab6f6e5d2060d
--- /dev/null
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -0,0 +1,81 @@
+name: Lint and Deploy Charts
+
+on: pull_request
+
+jobs:
+ lint-and-deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ fetch-depth: 0
+
+ - name: Set up Helm
+ uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
+ with:
+ version: v3.14.4
+
+ #Python is required because ct lint runs Yamale and yamllint which require Python.
+ - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: '3.13'
+
+ - name: Set up chart-testing
+ uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
+ with:
+ version: v3.10.1
+
+ - name: Run chart-testing (lint)
+ run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+
+ - name: Setup minio
+ run: |
+ docker network create vllm-net
+ docker run -d -p 9000:9000 --name minio --net vllm-net \
+ -e "MINIO_ACCESS_KEY=minioadmin" \
+ -e "MINIO_SECRET_KEY=minioadmin" \
+ -v /tmp/data:/data \
+ -v /tmp/config:/root/.minio \
+ minio/minio server /data
+ export AWS_ACCESS_KEY_ID=minioadmin
+ export AWS_SECRET_ACCESS_KEY=minioadmin
+ export AWS_EC2_METADATA_DISABLED=true
+ mkdir opt-125m
+ cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
+ aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
+ aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
+
+ - name: Create kind cluster
+ uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
+
+ - name: Build the Docker image vllm cpu
+ run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+
+ - name: Configuration of docker images, network and namespace for the kind cluster
+ run: |
+ docker pull amazon/aws-cli:2.6.4
+ kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
+ kind load docker-image vllm-cpu-env:latest --name chart-testing
+ docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
+ kubectl create ns ns-vllm
+
+ - name: Run chart-testing (install)
+ run: |
+ export AWS_ACCESS_KEY_ID=minioadmin
+ export AWS_SECRET_ACCESS_KEY=minioadmin
+ helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+
+ - name: curl test
+ run: |
+ kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
+ sleep 10
+ CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
+ --header "Content-Type: application/json" \
+ --data '{
+ "model": "opt-125m",
+ "prompt": "San Francisco is a",
+ "max_tokens": 7,
+ "temperature": 0
+ }'):$CODE"
+ echo "$CODE"
\ No newline at end of file
diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml
new file mode 100644
index 0000000000000..4932af943a07b
--- /dev/null
+++ b/.github/workflows/png-lint.yml
@@ -0,0 +1,37 @@
+name: Lint PNG exports from excalidraw
+on:
+ push:
+ branches:
+ - "main"
+ paths:
+ - '*.excalidraw.png'
+ - '.github/workflows/png-lint.yml'
+ pull_request:
+ branches:
+ - "main"
+ paths:
+ - '*.excalidraw.png'
+ - '.github/workflows/png-lint.yml'
+
+env:
+ LC_ALL: en_US.UTF-8
+
+defaults:
+ run:
+ shell: bash
+
+permissions:
+ contents: read
+
+jobs:
+ actionlint:
+ runs-on: ubuntu-latest
+ steps:
+ - name: "Checkout"
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ fetch-depth: 0
+
+ - name: "Run png-lint.sh to check excalidraw exported images"
+ run: |
+ tools/png-lint.sh
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c1051d10a4860..e40ceaaa8b037 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -39,67 +39,68 @@ jobs:
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
- wheel:
- name: Build Wheel
- runs-on: ${{ matrix.os }}
- needs: release
-
- strategy:
- fail-fast: false
- matrix:
- os: ['ubuntu-20.04']
- python-version: ['3.9', '3.10', '3.11', '3.12']
- pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
- cuda-version: ['11.8', '12.1']
-
- steps:
- - name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
- - name: Setup ccache
- uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
- with:
- create-symlink: true
- key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
- - name: Set up Linux Env
- if: ${{ runner.os == 'Linux' }}
- run: |
- bash -x .github/workflows/scripts/env.sh
-
- - name: Set up Python
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- with:
- python-version: ${{ matrix.python-version }}
-
- - name: Install CUDA ${{ matrix.cuda-version }}
- run: |
- bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-
- - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
- run: |
- bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
-
- - name: Build wheel
- shell: bash
- env:
- CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
- run: |
- bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
- wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
- asset_name=${wheel_name//"linux"/"manylinux1"}
- echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
- echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
- - name: Upload Release Asset
- uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- with:
- upload_url: ${{ needs.release.outputs.upload_url }}
- asset_path: ./dist/${{ env.wheel_name }}
- asset_name: ${{ env.asset_name }}
- asset_content_type: application/*
+ # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
+ # wheel:
+ # name: Build Wheel
+ # runs-on: ${{ matrix.os }}
+ # needs: release
+
+ # strategy:
+ # fail-fast: false
+ # matrix:
+ # os: ['ubuntu-20.04']
+ # python-version: ['3.9', '3.10', '3.11', '3.12']
+ # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
+ # cuda-version: ['11.8', '12.1']
+
+ # steps:
+ # - name: Checkout
+ # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+ # - name: Setup ccache
+ # uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
+ # with:
+ # create-symlink: true
+ # key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
+ # - name: Set up Linux Env
+ # if: ${{ runner.os == 'Linux' }}
+ # run: |
+ # bash -x .github/workflows/scripts/env.sh
+
+ # - name: Set up Python
+ # uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ # with:
+ # python-version: ${{ matrix.python-version }}
+
+ # - name: Install CUDA ${{ matrix.cuda-version }}
+ # run: |
+ # bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+ # - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+ # run: |
+ # bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+
+ # - name: Build wheel
+ # shell: bash
+ # env:
+ # CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+ # run: |
+ # bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+ # wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
+ # asset_name=${wheel_name//"linux"/"manylinux1"}
+ # echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+ # echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
+
+ # - name: Upload Release Asset
+ # uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
+ # env:
+ # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ # with:
+ # upload_url: ${{ needs.release.outputs.upload_url }}
+ # asset_path: ./dist/${{ env.wheel_name }}
+ # asset_name: ${{ env.asset_name }}
+ # asset_content_type: application/*
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package
diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml
new file mode 100644
index 0000000000000..e0bb24276a653
--- /dev/null
+++ b/.github/workflows/sphinx-lint.yml
@@ -0,0 +1,32 @@
+name: Lint documentation
+
+on:
+ push:
+ branches:
+ - main
+ paths:
+ - "docs/**"
+ pull_request:
+ branches:
+ - main
+ paths:
+ - "docs/**"
+
+jobs:
+ sphinx-lint:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.12"]
+ steps:
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-lint.txt
+ - name: Linting docs
+ run: tools/sphinx-lint.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5acbd762ee957..51b49a18dddf2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
@@ -196,6 +196,8 @@ set(VLLM_EXT_SRC
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
+ "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
+ "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
@@ -204,19 +206,32 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
- set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+ set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
- FetchContent_Declare(
+ # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
+ if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
+ set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
+ endif()
+
+ if(VLLM_CUTLASS_SRC_DIR)
+ if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
+ get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
+ endif()
+ message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
+ FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
+ else()
+ FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
- GIT_TAG v3.5.1
+ GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
- GIT_SHALLOW TRUE
- )
+ GIT_SHALLOW FALSE
+ )
+ endif()
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
@@ -224,10 +239,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
- "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+ "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+ "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
+ "csrc/sparse/cutlass/sparse_compressor_entry.cu"
+ "csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -236,7 +253,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
@@ -257,11 +274,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
#
- # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+ # The cutlass_scaled_mm, cutlass_scaled_sparse_mm, and cutlass_compressor kernels
+ # for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+ set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+ "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
+ "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -270,12 +290,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
- message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+ message(STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
- "later if you intend on running FP8 quantized models on "
+ "later if you intend on running FP8 sparse or quantized models on "
"Hopper.")
else()
- message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+ message(STATUS "Not building cutlass_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
@@ -288,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
- "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+ "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@@ -390,7 +410,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
- INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+ INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
@@ -414,7 +434,7 @@ set_gencode_flags_for_srcs(
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA")
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@@ -509,7 +529,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
+ GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/Dockerfile b/Dockerfile
index 220dbe26712ec..123703848749c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
+ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
@@ -46,9 +47,14 @@ WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ python3 -m pip install -r requirements-cuda-arm64.txt; \
+ fi
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
+ARG TARGETPLATFORM
# install build dependencies
COPY requirements-build.txt requirements-build.txt
@@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ python3 -m pip install -r requirements-cuda-arm64.txt; \
+ fi
+
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
@@ -134,8 +146,8 @@ COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
-
#################### DEV IMAGE ####################
+
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
@@ -143,6 +155,9 @@ ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETPLATFORM
+
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@@ -168,18 +183,25 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
-# install vllm wheel first, so that torch etc will be installed
+# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=cache,target=/root/.cache/pip \
- . /etc/environment && \
- python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ pip uninstall -y torch && \
+ python3 -m pip install -r requirements-cuda-arm64.txt; \
+ fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+. /etc/environment && \
+if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
+ python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+fi
COPY examples examples
#################### vLLM installation IMAGE ####################
-
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
@@ -191,6 +213,10 @@ ADD . /vllm-workspace/
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+ python3 -m pip install -e tests/vllm_test_utils
+
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install hf_transfer
@@ -205,7 +231,6 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
-
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
@@ -214,8 +239,11 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
-
+ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
+ else \
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
+ fi
ENV VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.arm b/Dockerfile.arm
new file mode 100644
index 0000000000000..093ee2209222f
--- /dev/null
+++ b/Dockerfile.arm
@@ -0,0 +1,62 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-arm
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/apt \
+ apt-get update -y \
+ && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
+
+# Set LD_PRELOAD for tcmalloc on ARM
+ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
+ pip install --upgrade pip && \
+ pip install -r requirements-build.txt
+
+FROM cpu-test-arm AS build
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+ --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
+ pip install -v -r requirements-cpu.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Disabling AVX512 specific optimizations for ARM
+ARG VLLM_CPU_DISABLE_AVX512="true"
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=cache,target=/root/.cache/ccache \
+ --mount=type=bind,source=.git,target=.git \
+ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+ pip install dist/*.whl && \
+ rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 287b4958da4e5..ebe226cf6d148 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -16,7 +16,7 @@ RUN --mount=type=cache,target=/var/cache/apt \
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install intel-openmp
+ pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
@@ -62,4 +62,8 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -e tests/vllm_test_utils
+
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.hpu b/Dockerfile.hpu
index d18fc016387bf..87e0c1a6a934e 100644
--- a/Dockerfile.hpu
+++ b/Dockerfile.hpu
@@ -11,6 +11,9 @@ ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 2143315d2a078..77162bc82de62 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -1,5 +1,6 @@
# default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
+# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
FROM $BASE_IMAGE
@@ -38,4 +39,7 @@ ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
pip install --no-build-isolation -v -e .
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.openvino b/Dockerfile.openvino
index a05ff452cd36e..8bd188ffde408 100644
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -22,4 +22,7 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVIC
COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/benchmarks
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index b19c6ddec7948..971248577983f 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -29,6 +29,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 8fb79afaebe97..e733994f8c33e 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -51,9 +51,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
*"rocm-6.2"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --pre \
- torch==2.6.0.dev20240918 \
+ torch==2.6.0.dev20241113+rocm6.2 \
'setuptools-scm>=8' \
- torchvision==0.20.0.dev20240918 \
+ torchvision==0.20.0.dev20241113+rocm6.2 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
*) ;; esac
@@ -168,4 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
if ls libs/*.whl; then \
python3 -m pip install libs/*.whl; fi
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index 0a507b6ecdf60..b617932a85b47 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -22,4 +22,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
-r requirements-tpu.txt
RUN python3 setup.py develop
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 8bb80a299ccc4..827792c8c7bd8 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -50,9 +50,25 @@ ENV CUDA_HOME="/usr/local/cuda" \
PATH="${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
+# Install necessary packages to build numactl from source
+RUN microdnf install -y autoconf automake libtool make rpm-build
+
+# Download the numactl source RPM to /root directory
+WORKDIR /root
+RUN microdnf download --enablerepo=ubi-9-baseos-source --source numactl.src
+
+# Get NUMACTL_V (just the version part of the RPM file)
+RUN NUMACTL_V=$(ls /root/numactl-* | sed -r 's|/root/numactl-(.+)\.el9\.src\.rpm|\1|') && \
+ echo "########################################### NUMACTL_V is set to $NUMACTL_V" && \
+ rpm -i /root/numactl-${NUMACTL_V}.el9.src.rpm && \
+ rpmbuild -ba /root/rpmbuild/SPECS/numactl.spec && \
+ rpm -i /root/rpmbuild/RPMS/x86_64/{numactl-libs-${NUMACTL_V}.el9.x86_64.rpm,numactl-${NUMACTL_V}.el9.x86_64.rpm,numactl-devel-${NUMACTL_V}.el9.x86_64.rpm}
+
## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base
+WORKDIR /workspace
+
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
@@ -206,10 +222,15 @@ FROM vllm-openai as vllm-grpc-adapter
USER root
+RUN --mount=type=bind,from=cuda-base,source=/root/rpmbuild/RPMS/x86_64,target=/tmp/rpm/ \
+ rpm -i --nodeps /tmp/rpm/numactl-libs-*.el9.x86_64.rpm
+
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
- HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.5.3
+ HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]"
+
+RUN pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter.git@vllm0p6p5
ENV GRPC_PORT=8033 \
PORT=8000 \
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 63bc682770422..a374f20d7d949 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -64,5 +64,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1
-
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/README.md b/README.md
index 6530886ed7de2..93b71ddaccc61 100644
--- a/README.md
+++ b/README.md
@@ -16,9 +16,10 @@ Easy, fast, and cheap LLM serving for everyone
---
*Latest News* 🔥
-- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing).
+- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
+- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
-- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
+- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
@@ -100,6 +101,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Dropbox
- Google Cloud
- Lambda Lab
+- Nebius
- NVIDIA
- Replicate
- Roblox
@@ -132,3 +134,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
* For coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
+
+## Media Kit
+
+* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 25c8b1bbf3e22..b67849038cf0d 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -24,6 +24,7 @@ class RequestFuncInput:
model: str
best_of: int = 1
logprobs: Optional[int] = None
+ extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
@@ -36,6 +37,7 @@ class RequestFuncOutput:
ttft: float = 0.0 # Time to first token
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
+ tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
@@ -54,6 +56,7 @@ async def async_request_tgi(
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
+ "truncate": request_func_input.prompt_len,
# TGI does not accept ignore_eos flag.
}
payload = {
@@ -241,6 +244,8 @@ async def async_request_openai_completions(
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
+ if request_func_input.extra_body:
+ payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
@@ -335,6 +340,8 @@ async def async_request_openai_chat_completions(
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
+ if request_func_input.extra_body:
+ payload.update(request_func_input.extra_body)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py
new file mode 100644
index 0000000000000..1a0e62598bfcb
--- /dev/null
+++ b/benchmarks/benchmark_guided.py
@@ -0,0 +1,494 @@
+"""Benchmark guided decoding throughput."""
+import argparse
+import dataclasses
+import json
+import os
+import random
+import time
+from typing import List
+
+import datasets
+import pandas as pd
+import uvloop
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+ build_async_engine_client_from_engine_args)
+from vllm.sampling_params import GuidedDecodingParams
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+@dataclasses.dataclass
+class SampleRequest:
+ """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+        schema: The guided-decoding target (a JSON schema, grammar, regex
+            pattern, or choice list, depending on structure_type).
+        structure_type: The kind of structured output to request.
+        completion: The reference completion from the dataset, if any.
+    """
+ prompt: str
+ prompt_len: int
+ expected_output_len: int
+ schema: dict
+ structure_type: str = 'json'
+ completion: str = None
+
+
+def run_vllm(requests: List[SampleRequest],
+ engine_args: EngineArgs,
+ n: int,
+ guided_decoding_rate: float = 1.0,
+ warmup: bool = False) -> float:
+ from vllm import LLM, SamplingParams
+ llm = LLM(**vars(engine_args))
+
+ # Add the requests to the engine.
+ prompts: List[str] = []
+ sampling_params: List[SamplingParams] = []
+ # create a list containing random selected true or false
+ guided_decoding_req_idx = random.sample(
+ range(len(requests)), int(len(requests) * guided_decoding_rate))
+
+ if warmup:
+        print(">>>>> Running warmup with the first 5 prompts")
+        # Use the first 5 requests to warm up the FSM cache.
+        # Warmup is skipped for the xgrammar_bench dataset.
+ warmup_requests = requests[:5]
+ for i, request in enumerate(warmup_requests):
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ guided_decoding=GuidedDecodingParams(json=request.schema)
+ if guided_decoding_rate > 0 else None,
+ ))
+ llm.generate(prompts, sampling_params, use_tqdm=False)
+
+ print(">>>>> Benchmark started...")
+ prompts = []
+ sampling_params = []
+ for i, request in enumerate(requests):
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ guided_decoding=GuidedDecodingParams(
+ **{request.structure_type: request.schema})
+ if i in guided_decoding_req_idx else None,
+ ))
+
+ start = time.perf_counter()
+ outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
+ ret = []
+ for output, request in zip(outputs, requests):
+ generated_text = output.outputs[0].text
+ ret.append({
+ "generated": generated_text,
+ "expected": request.completion
+ })
+ end = time.perf_counter()
+ return end - start, ret
+
+
+async def run_vllm_async(
+ requests: List[SampleRequest],
+ engine_args: AsyncEngineArgs,
+ n: int,
+ guided_decoding_rate: float = 1.0,
+ warmup: bool = False,
+ disable_frontend_multiprocessing: bool = False) -> float:
+ from vllm import SamplingParams
+
+ async with build_async_engine_client_from_engine_args(
+ engine_args, disable_frontend_multiprocessing) as llm:
+
+ # Add the requests to the engine.
+ prompts: List[str] = []
+ sampling_params: List[SamplingParams] = []
+ guided_decoding_req_idx = random.sample(
+ range(len(requests)), int(len(requests) * guided_decoding_rate))
+
+ if warmup:
+            print(">>>>>> Running warmup with the first 5 prompts")
+            # Use the first 5 requests to warm up the FSM cache.
+            # Warmup is skipped for the xgrammar_bench dataset.
+ warmup_requests = requests[:5]
+ for i, request in enumerate(warmup_requests):
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ guided_decoding=GuidedDecodingParams(
+ json=request.schema)
+ if guided_decoding_rate > 0 else None,
+ ))
+ generators = []
+ for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+ generator = llm.generate(prompt, sp, request_id=f"test{i}")
+ generators.append(generator)
+ all_gens = merge_async_iterators(*generators)
+ async for i, res in all_gens:
+ pass
+
+ print(">>>>> Benchmark started...")
+ prompts = []
+ sampling_params = []
+ for i, request in enumerate(requests):
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ guided_decoding=GuidedDecodingParams(json=request.schema)
+ if i in guided_decoding_req_idx else None,
+ ))
+
+ generators = []
+ start_time = []
+ latencies = []
+ start = time.perf_counter()
+ for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+ generator = llm.generate(prompt, sp, request_id=f"test{i}")
+ generators.append(generator)
+ start_time.append(time.perf_counter())
+ latencies.append([])
+ all_gens = merge_async_iterators(*generators)
+ generated_texts = [''] * len(requests)
+ async for i, res in all_gens:
+ generated_texts[i] = res.outputs[0].text
+ lat = time.perf_counter() - start_time[i]
+ latencies[i].append(lat)
+ ret = [{
+ 'generated': gt,
+ 'expected': req.completion
+ } for gt, req in zip(generated_texts, requests)]
+ end = time.perf_counter()
+ first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
+ next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
+ for lat in latencies])
+ return end - start, ret, (first_latency, next_latency)
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+ args: argparse.Namespace) -> List[SampleRequest]:
+ if args.dataset == 'json':
+ if args.json_schema_path is None:
+ dir_path = os.path.dirname(os.path.realpath(__file__))
+ args.json_schema_path = os.path.join(dir_path,
+ "structured_schemas",
+ "structured_schema_1.json")
+ with open(args.json_schema_path) as f:
+ schema = json.load(f)
+ prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "grammar":
+ schema = """
+ ?start: select_statement
+
+ ?select_statement: "SELECT " column_list " FROM " table_name
+
+ ?column_list: column_name ("," column_name)*
+
+ ?table_name: identifier
+
+ ?column_name: identifier
+
+ ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+ """
+ prompt = "Generate an SQL query to show the 'username' \
+ and 'email' from the 'users' table."
+
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "regex":
+ regex = r"\w+@\w+\.com\n"
+ args.regex = regex
+ prompt = "Generate an email address for Alan Turing, \
+ who works in Enigma. End in .com and new line. \
+ Example result: alan.turing@enigma.com\n"
+
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=regex,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "choice":
+ choice = ["Positive", "Negative"]
+ args.choice = choice
+ prompt = "Classify this sentiment: vLLM is wonderful!"
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=choice,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "xgrammar_bench":
+ args.warmup = False
+ requests: List[SampleRequest] = []
+ dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+ split="train")
+ print(f"dataset has {len(dataset)} entries")
+ len_dataset = len(dataset)
+ for data_point_idx in range(args.num_prompts):
+ idx = data_point_idx
+ while idx >= len_dataset:
+ idx -= len_dataset
+ schema = dataset["schema"][idx]
+ prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+ tokenize=False)
+ input_len = len(tokenizer(prompt).input_ids)
+ completion = dataset["completion"][idx]
+
+ requests.append(
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ completion=completion))
+
+ return requests
+
+
+def evaluate(ret, args):
+
+ def _eval_correctness_json(expected, actual):
+ # extract json string from string using regex
+ import re
+ actual = actual.replace('\n', '').replace(' ', '').strip()
+ try:
+ actual = re.search(r'\{.*\}', actual).group()
+ actual = json.loads(actual)
+ except Exception:
+ return False
+
+ return True
+
+ def _eval_correctness_choice(expected, actual):
+ return actual in args.choice
+
+ def _eval_correctness_regex(expected, actual):
+ import re
+ return re.match(args.regex, actual) is not None
+
+ def _eval_correctness(expected, actual):
+ if args.structure_type == 'json':
+ return _eval_correctness_json(expected, actual)
+ elif args.structure_type == 'regex':
+ return _eval_correctness_regex(expected, actual)
+ elif args.structure_type == 'choice':
+ return _eval_correctness_choice(expected, actual)
+ else:
+ return None
+
+ scores = []
+ for res in ret:
+ score = _eval_correctness(res['expected'], res['generated'])
+ res['correctness'] = score
+ scores.append(score)
+
+ not_none_scores = [score for score in scores if score is not None]
+
+ return (sum(not_none_scores) / len(not_none_scores) *
+ 100) if len(not_none_scores) > 0 else None
+
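
To make the JSON correctness check above concrete, the same extract-then-parse logic applied to a hypothetical model response:

    import json
    import re

    actual = ('Sure! Here is the profile:\n'
              '{ "name": "Alan", "email": "alan@enigma.com" }')
    cleaned = actual.replace('\n', '').replace(' ', '').strip()
    match = re.search(r'\{.*\}', cleaned)  # grab the outermost {...} span
    parsed = json.loads(match.group())     # a parse failure counts as incorrect
    print(parsed)  # {'name': 'Alan', 'email': 'alan@enigma.com'}
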
+
+def main(args: argparse.Namespace):
+ print(args)
+ random.seed(args.seed)
+
+ # async engine is working for 'regex', 'choice' and 'grammar'
+ if args.dataset == 'grammar':
+ args.structure_type = 'grammar'
+ args.async_engine = False
+ elif args.dataset == 'regex':
+ args.structure_type = 'regex'
+ args.async_engine = False
+ elif args.dataset == 'choice':
+ args.structure_type = 'choice'
+ args.async_engine = False
+ else:
+ args.structure_type = 'json'
+
+ if args.no_guided_decoding:
+ args.guided_decoding_ratio = 0
+ if args.save_results:
+ result_file_name = f'{args.guided_decoding_ratio}guided'
+ result_file_name += f"_{args.model.split('/')[-1]}"
+ result_file_name += f"_{args.dataset}"
+ result_file_name += f"_{args.num_prompts}"
+ result_file_name += f"_out{args.output_len}"
+ result_file_name += f"_async{args.async_engine}"
+ result_file_name += f"_warmup{args.warmup}"
+ result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
+ result_file_name += ".txt"
+ else:
+ result_file_name = None
+
+ # Synthesize a prompt with the given input length.
+ tokenizer = AutoTokenizer.from_pretrained(
+ args.tokenizer, trust_remote_code=args.trust_remote_code)
+ requests = sample_requests(tokenizer, args)
+
+ if args.async_engine:
+ engine_args = AsyncEngineArgs.from_cli_args(args)
+ elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
+ run_vllm_async(requests, engine_args, args.n,
+ args.guided_decoding_ratio, args.warmup,
+ args.disable_frontend_multiprocessing))
+ else:
+ engine_args = EngineArgs.from_cli_args(args)
+ elapsed_time, ret = run_vllm(requests, engine_args, args.n,
+ args.guided_decoding_ratio, args.warmup)
+ first_latency, next_latency = None, None
+
+ score = evaluate(ret, args)
+ total_num_tokens = sum(request.prompt_len + request.expected_output_len
+ for request in requests)
+ total_output_tokens = sum(request.expected_output_len
+ for request in requests)
+ if first_latency is not None:
+ latency_breakdown = "\nFirst token latency(msecs):\n"
+ latency_breakdown += f"{first_latency.describe()}"
+ latency_breakdown += "\nNext token latency(msecs):\n"
+ latency_breakdown += f"{next_latency.describe()}"
+ print(
+ f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+ f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+ f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
+ f"Correct rate is {score} %",
+ f"{latency_breakdown if first_latency is not None else ''}")
+
+ # Output JSON results if specified
+ if args.output_json or result_file_name:
+ results = {
+ "elapsed_time": elapsed_time,
+ "num_requests": len(requests),
+ "total_num_tokens": total_num_tokens,
+ "total_output_tokens": total_output_tokens,
+ "requests_per_second": len(requests) / elapsed_time,
+ "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
+ "output_tokens_per_second":
+ f"{total_output_tokens / elapsed_time:.2f}",
+ "correct_rate(%)": score
+ }
+ results = {"outputs": ret, **results}
+ if first_latency is not None:
+ results["first_token_latency(msecs)"] = first_latency.describe(
+ ).to_dict()
+ results["next_token_latency(msecs)"] = next_latency.describe(
+ ).to_dict()
+ if args.output_json:
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+ elif result_file_name:
+ with open(result_file_name, "w") as f:
+ json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
+ parser = AsyncEngineArgs.add_cli_args(parser)
+
+ parser.add_argument("--output-len",
+ type=int,
+ default=512,
+ help="Output length for each request. Overrides the "
+ "output length from the dataset.")
+ parser.add_argument(
+ "--dataset",
+ default='json',
+ choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+ parser.add_argument("--json_schema_path",
+ type=str,
+ default=None,
+ help="Path to json schema.")
+ parser.add_argument("--n",
+ type=int,
+ default=1,
+ help="Number of generated sequences per prompt.")
+ parser.add_argument("--num-prompts",
+ type=int,
+ default=10,
+ help="Number of prompts to process.")
+ parser.add_argument(
+ '--output-json',
+ type=str,
+ default=None,
+ help='Path to save the throughput results in JSON format.')
+ parser.add_argument("--async-engine",
+ action='store_true',
+ default=False,
+ help="Use vLLM async engine rather than LLM class.")
+ parser.add_argument("--no-guided-decoding",
+ action='store_true',
+ default=False,
+                        help="Whether to disable guided decoding.")
+ parser.add_argument("--guided-decoding-ratio",
+ type=float,
+ default=1.0,
+                        help="Ratio of requests that use guided decoding.")
+ parser.add_argument("--disable-frontend-multiprocessing",
+ action='store_true',
+ default=False,
+ help="Disable decoupled async engine frontend.")
+ parser.add_argument("--warmup",
+ action="store_true",
+ default=False,
+ help="Run warmup prompts before benchmark.")
+ parser.add_argument("--save-results",
+ action="store_true",
+ default=False,
+                        help="Save benchmark output results to a file.")
+ args = parser.parse_args()
+ if args.tokenizer is None:
+ args.tokenizer = args.model
+ main(args)
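
For a quick sense of what each benchmarked request does, a minimal offline sketch using the same vLLM APIs the script imports (LLM, SamplingParams, GuidedDecodingParams); the model id and schema are placeholders, not part of the diff.

    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    schema = {"type": "object",
              "properties": {"name": {"type": "string"}},
              "required": ["name"]}

    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model id
    params = SamplingParams(
        temperature=1.0,
        max_tokens=128,
        # Constrain decoding to the JSON schema, as run_vllm() does above.
        guided_decoding=GuidedDecodingParams(json=schema),
    )
    outputs = llm.generate(["Generate a user profile as JSON."], params)
    print(outputs[0].outputs[0].text)
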
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 6d33096ca1d11..5e9381f712e10 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -54,13 +54,30 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
print(f"cost time {end_time - start_time}")
-def sample_requests(
+@dataclasses.dataclass
+class Request:
+ prompt: str
+ prompt_len: int
+ output_len: int
+
+
+def sample_tokens(tokenizer: PreTrainedTokenizerBase,
+                  length: int) -> List[int]:
+    vocab = tokenizer.get_vocab()
+    # Remove the special tokens (compare token IDs, not token strings).
+    vocab = {
+        k: v
+        for k, v in vocab.items() if v not in tokenizer.all_special_ids
+    }
+    return random.choices(list(vocab.values()), k=length)
+
+
+def sample_requests_from_dataset(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> List[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@@ -77,31 +94,55 @@ def sample_requests(
random.shuffle(dataset)
min_len, max_len = input_length_range
+ assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
# Filter out sequences that are too long or too short
- filtered_dataset: List[Tuple[str, int, int]] = []
+ filtered_requests: List[Request] = []
+
for i in range(len(dataset)):
- if len(filtered_dataset) == num_requests:
+ if len(filtered_requests) == num_requests:
break
# Tokenize the prompts and completions.
- prompt = dataset[i][0]
- prompt_token_ids = tokenizer(prompt).input_ids
+ prompt_token_ids = tokenizer(dataset[i][0]).input_ids
+ prompt = tokenizer.decode(prompt_token_ids)
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
- output_len = len(completion_token_ids
- ) if fixed_output_len is None else fixed_output_len
- if prompt_len < 4 or output_len < 4:
- # Prune too short sequences.
- continue
+ output_len = (len(completion_token_ids)
+ if fixed_output_len is None else fixed_output_len)
if min_len <= prompt_len <= max_len:
- filtered_dataset.append((prompt, prompt_len, output_len))
+ filtered_requests.append(Request(prompt, prompt_len, output_len))
+
+ return filtered_requests
+
+
+def sample_requests_from_random(
+ num_requests: int,
+ tokenizer: PreTrainedTokenizerBase,
+ input_length_range: Tuple[int, int],
+ fixed_output_len: Optional[int],
+ prefix_len: int,
+) -> List[Request]:
- return filtered_dataset
+ requests = []
+ prefix_token_ids = sample_tokens(tokenizer, prefix_len)
+ min_len, max_len = input_length_range
+
+ for i in range(num_requests):
+ unique_part_token_ids = sample_tokens(
+ tokenizer,
+ random.randint(min_len - prefix_len, max_len - prefix_len))
+ prompt_token_ids = prefix_token_ids + unique_part_token_ids
+ prompt = tokenizer.decode(prompt_token_ids)
+ prompt_len = len(prompt_token_ids)
+ assert (min_len <= prompt_len <= max_len
+                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+ requests.append(Request(prompt, prompt_len, fixed_output_len))
+ return requests
-def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
+def repeat_and_sort_requests(requests: List[Request],
repeat_count: int,
sort: bool = False) -> List[str]:
repeated_requests = requests * repeat_count
@@ -109,7 +150,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
repeated_requests.sort(key=lambda x: x[1])
else:
random.shuffle(repeated_requests)
- return [req[0] for req in repeated_requests]
+ return [req.prompt for req in repeated_requests]
def main(args):
@@ -117,9 +158,12 @@ def main(args):
input_length_range = tuple(map(int, args.input_length_range.split(':')))
random.seed(args.seed)
if args.dataset_path is not None:
- print(f"Start to sample {args.num_prompts} prompts"
+ if args.prefix_len > 0:
+ raise ValueError("prefix-len is not supported when "
+ "dataset-path is provided.")
+ print(f"Start to sample {args.num_prompts} prompts "
f"from {args.dataset_path}")
- filtered_datasets = sample_requests(
+ filtered_requests = sample_requests_from_dataset(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
@@ -127,9 +171,22 @@ def main(args):
fixed_output_len=args.output_len,
)
else:
- prompt_len = len(tokenizer(PROMPT).input_ids)
- filtered_datasets = [(PROMPT, prompt_len, args.output_len)
- ] * args.num_prompts
+ print(f"Start to sample {args.num_prompts} prompts from random")
+ filtered_requests = sample_requests_from_random(
+ num_requests=args.num_prompts,
+ tokenizer=tokenizer,
+ input_length_range=input_length_range,
+ fixed_output_len=args.output_len,
+ prefix_len=args.prefix_len,
+ )
+
+ # Print some helpful stats of the requests.
+ print(f"Sampled {len(filtered_requests)} requests.")
+ prompt_lens = [req.prompt_len for req in filtered_requests]
+ print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}")
+ print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}")
+ print(f"Min Prompt Length: {min(prompt_lens)}")
+ print(f"Max Prompt Length: {max(prompt_lens)}")
engine_args = EngineArgs.from_cli_args(args)
@@ -137,8 +194,8 @@ def main(args):
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
- print("Testing filtered datasets")
- prompts = repeat_and_sort_requests(filtered_datasets,
+ print("Testing filtered requests")
+ prompts = repeat_and_sort_requests(filtered_requests,
repeat_count=args.repeat_count,
sort=args.sort)
@@ -161,20 +218,29 @@ def main(args):
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--num-prompts',
type=int,
- default=1,
+ required=True,
help="Number of the prompts sampled from dataset")
parser.add_argument('--repeat-count',
type=int,
- default=100,
+ default=1,
help='Number of times to repeat each prompt')
parser.add_argument('--sort',
action='store_true',
help='Sort prompts by input length')
parser.add_argument('--input-length-range',
type=str,
- default='128:256',
+ required=True,
help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").')
+ parser.add_argument(
+ "--prefix-len",
+ type=int,
+ default=0,
+ help="Specifies the length of a common prefix to be "
+ "added to the input prompt. The input-length-range will "
+ "subtract this length when filtering prompts. Only used "
+ "when dataset-path is not provided.",
+ )
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
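
To illustrate the new random-sampling path: every generated prompt shares a --prefix-len token prefix, and the unique tail is sized so the total length stays inside --input-length-range. A rough standalone sketch of that arithmetic, using a placeholder tokenizer:

    import random

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
    min_len, max_len, prefix_len = 128, 256, 64

    vocab_ids = list(tokenizer.get_vocab().values())
    prefix = random.choices(vocab_ids, k=prefix_len)   # shared by all prompts
    for _ in range(3):
        tail_len = random.randint(min_len - prefix_len, max_len - prefix_len)
        prompt_ids = prefix + random.choices(vocab_ids, k=tail_len)
        assert min_len <= len(prompt_ids) <= max_len
        print(len(prompt_ids), repr(tokenizer.decode(prompt_ids)[:40]))
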
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index bdb8ea8e2a5dc..4eb0e1f8ac903 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -199,6 +199,56 @@ def sample_sonnet_requests(
return sampled_requests
+def sample_mmmu_pro_vision_requests(
+ dataset,
+ num_requests: int,
+ tokenizer: PreTrainedTokenizerBase,
+ fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+ sampled_requests: List[Tuple[str, int, int, Dict[str,
+ Collection[str]]]] = []
+ for data in dataset:
+ if len(sampled_requests) == num_requests:
+ break
+
+ # MMMU-Pro vision direct prompt
+ # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
+ prompt = (
+ "Answer with the option letter from the given choices directly. "
+ "The last line of your response should be of the following "
+ "format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
+ "options.")
+
+ prompt_token_ids = tokenizer(prompt).input_ids
+ if fixed_output_len is None:
+ # Default max output len is set to 128
+ print("--hf-output-len is not provided. Using default value 128.")
+ fixed_output_len = 128
+
+ prompt_len = len(prompt_token_ids)
+ output_len = fixed_output_len
+
+ assert isinstance(
+ data["image"],
+ Image), ("Input image format must be `PIL.Image.Image`, "
+ f"given {type(data['image'])}.")
+ image: Image = data["image"]
+ image = image.convert("RGB")
+ image_data = io.BytesIO()
+ image.save(image_data, format='JPEG')
+ image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
+ mm_content = {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{image_base64}"
+ },
+ }
+
+ sampled_requests.append((prompt, prompt_len, output_len, mm_content))
+
+ return sampled_requests
+
+
def sample_hf_requests(
dataset_path: str,
dataset_subset: str,
@@ -208,6 +258,21 @@ def sample_hf_requests(
random_seed: int,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+
+ # Special case for MMMU-Pro vision dataset
+ if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
+ assert dataset_split == "test"
+ dataset = load_dataset(dataset_path,
+ name=dataset_subset,
+ split=dataset_split,
+ streaming=True)
+ assert "image" in dataset.features, (
+ "MMMU/MMMU_Pro vision dataset must have 'image' column.")
+ filter_func = lambda x: isinstance(x["image"], Image)
+ dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
+ return sample_mmmu_pro_vision_requests(dataset, num_requests,
+ tokenizer, fixed_output_len)
+
dataset = load_dataset(dataset_path,
name=dataset_subset,
split=dataset_split,
@@ -251,6 +316,19 @@ def sample_hf_requests(
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
+ elif "image" in data and isinstance(data["image"], str):
+ if (data["image"].startswith("http://") or \
+ data["image"].startswith("file://")):
+ image_url = data["image"]
+ else:
+ image_url = f"file://{data['image']}"
+
+ mm_content = {
+ "type": "image_url",
+ "image_url": {
+ "url": image_url
+ },
+ }
else:
mm_content = None
@@ -703,6 +781,7 @@ def main(args: argparse.Namespace):
backend = args.backend
model_id = args.model
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+ tokenizer_mode = args.tokenizer_mode
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
@@ -712,6 +791,7 @@ def main(args: argparse.Namespace):
base_url = f"http://{args.host}:{args.port}"
tokenizer = get_tokenizer(tokenizer_id,
+ tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code)
if args.dataset is not None:
@@ -1132,5 +1212,15 @@ def main(args: argparse.Namespace):
"from the sampled HF dataset.",
)
+ parser.add_argument(
+ '--tokenizer-mode',
+ type=str,
+ default="auto",
+ choices=['auto', 'slow', 'mistral'],
+ help='The tokenizer mode.\n\n* "auto" will use the '
+ 'fast tokenizer if available.\n* "slow" will '
+ 'always use the slow tokenizer. \n* '
+ '"mistral" will always use the `mistral_common` tokenizer.')
+
args = parser.parse_args()
main(args)
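
The MMMU-Pro vision path above ships each image to the server as a base64 data URL inside OpenAI-style image_url content. A standalone sketch of that encoding step; the image path is hypothetical:

    import base64
    import io

    from PIL import Image

    image = Image.open("example.jpg").convert("RGB")  # hypothetical local file
    buf = io.BytesIO()
    image.save(buf, format="JPEG")
    image_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    mm_content = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
    }
    print(mm_content["image_url"]["url"][:48] + "...")
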
diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py
new file mode 100644
index 0000000000000..4435d87e18a8a
--- /dev/null
+++ b/benchmarks/benchmark_serving_guided.py
@@ -0,0 +1,881 @@
+r"""Benchmark online serving throughput with guided decoding.
+
+On the server side, run one of the following commands:
+    (vLLM OpenAI API server)
+    vllm serve <your_model> --disable-log-requests
+
+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
+On the client side, run:
+    python benchmarks/benchmark_serving_guided.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset json \
+        --guided-decoding-ratio 1.0 \
+        --guided-decoding-backend xgrammar \
+        --request-rate 10 \
+        --num-prompts 1000
+
+ when using tgi backend, add
+ --endpoint /generate_stream
+ to the end of the command above.
+"""
+import argparse
+import asyncio
+import dataclasses
+import json
+import os
+import random
+import time
+import warnings
+from dataclasses import dataclass
+from typing import AsyncGenerator, List, Optional, Tuple
+
+import datasets
+import numpy as np
+import pandas as pd
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+ RequestFuncOutput)
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+try:
+ from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+ from backend_request_func import get_tokenizer
+
+try:
+ from vllm.utils import FlexibleArgumentParser
+except ImportError:
+ from argparse import ArgumentParser as FlexibleArgumentParser
+
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
+
+@dataclass
+class BenchmarkMetrics:
+ completed: int
+ total_input: int
+ total_output: int
+ request_throughput: float
+ request_goodput: float
+ output_throughput: float
+ total_token_throughput: float
+ mean_ttft_ms: float
+ median_ttft_ms: float
+ std_ttft_ms: float
+ percentiles_ttft_ms: List[Tuple[float, float]]
+ mean_tpot_ms: float
+ median_tpot_ms: float
+ std_tpot_ms: float
+ percentiles_tpot_ms: List[Tuple[float, float]]
+ mean_itl_ms: float
+ median_itl_ms: float
+ std_itl_ms: float
+ percentiles_itl_ms: List[Tuple[float, float]]
+ # E2EL stands for end-to-end latency per request.
+ # It is the time taken on the client side from sending
+ # a request to receiving a complete response.
+ mean_e2el_ms: float
+ median_e2el_ms: float
+ std_e2el_ms: float
+ percentiles_e2el_ms: List[Tuple[float, float]]
+
+
+@dataclasses.dataclass
+class SampleRequest:
+ """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+        schema: The guided-decoding target (a JSON schema, grammar, regex
+            pattern, or choice list, depending on structure_type).
+        structure_type: The kind of structured output to request.
+        completion: The reference completion from the dataset, if any.
+    """
+ prompt: str
+ prompt_len: int
+ expected_output_len: int
+ schema: dict
+ structure_type: str
+ completion: str = None
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+ args: argparse.Namespace) -> List[SampleRequest]:
+ if args.dataset == 'json':
+ if args.json_schema_path is None:
+ dir_path = os.path.dirname(os.path.realpath(__file__))
+ args.json_schema_path = os.path.join(dir_path,
+ "structured_schemas",
+ "structured_schema_1.json")
+ with open(args.json_schema_path) as f:
+ schema = json.load(f)
+ prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "grammar":
+ schema = """
+ ?start: select_statement
+
+ ?select_statement: "SELECT " column_list " FROM " table_name
+
+ ?column_list: column_name ("," column_name)*
+
+ ?table_name: identifier
+
+ ?column_name: identifier
+
+ ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+ """
+ prompt = "Generate an SQL query to show the 'username' \
+ and 'email' from the 'users' table."
+
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "regex":
+ regex = r"\w+@\w+\.com\n"
+ args.regex = regex
+ prompt = "Generate an email address for Alan Turing, \
+ who works in Enigma. End in .com and new line. \
+ Example result: alan.turing@enigma.com\n"
+
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=regex,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "choice":
+ choice = ["Positive", "Negative"]
+ args.choice = choice
+ prompt = "Classify this sentiment: vLLM is wonderful!"
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=choice,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "xgrammar_bench":
+ requests: List[SampleRequest] = []
+ dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+ split="train")
+ print(f"dataset has {len(dataset)} entries")
+ len_dataset = len(dataset)
+ for data_point_idx in range(args.num_prompts):
+ idx = data_point_idx
+ while idx >= len_dataset:
+ idx -= len_dataset
+ schema = dataset["schema"][idx]
+ prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+ tokenize=False)
+ input_len = len(tokenizer(prompt).input_ids)
+ completion = dataset["completion"][idx]
+
+ requests.append(
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type,
+ completion=completion))
+
+ return requests
+
+
+async def get_request(
+ input_requests: List[SampleRequest],
+ request_rate: float,
+ burstiness: float = 1.0,
+) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
+ """
+ Asynchronously generates requests at a specified rate
+ with OPTIONAL burstiness.
+
+ Args:
+ input_requests:
+ A list of input requests, each represented as a tuple.
+ request_rate:
+ The rate at which requests are generated (requests/s).
+ burstiness (optional):
+ The burstiness factor of the request generation.
+ Only takes effect when request_rate is not inf.
+ Default value is 1, which follows a Poisson process.
+ Otherwise, the request intervals follow a gamma distribution.
+ A lower burstiness value (0 < burstiness < 1) results
+ in more bursty requests, while a higher burstiness value
+ (burstiness > 1) results in a more uniform arrival of requests.
+ """
+ input_requests = iter(input_requests)
+
+ # Calculate scale parameter theta to maintain the desired request_rate.
+ assert burstiness > 0, (
+ f"A positive burstiness factor is expected, but given {burstiness}.")
+ theta = 1.0 / (request_rate * burstiness)
+
+ for i, request in enumerate(input_requests):
+ yield i, request
+
+ if request_rate == float("inf"):
+ # If the request rate is infinity, then we don't need to wait.
+ continue
+
+ # Sample the request interval from the gamma distribution.
+ # If burstiness is 1, it follows exponential distribution.
+ interval = np.random.gamma(shape=burstiness, scale=theta)
+ # The next request will be sent after the interval.
+ await asyncio.sleep(interval)
+
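
A small numeric check of the interval sampling documented above: with shape=burstiness and scale=1/(request_rate*burstiness), the mean interval is always 1/request_rate, while lower burstiness gives higher variance (burstier arrivals) and burstiness=1 reduces to a Poisson process. The values below are illustrative only:

    import numpy as np

    request_rate = 10.0  # requests/s
    for burstiness in (0.5, 1.0, 2.0):
        theta = 1.0 / (request_rate * burstiness)
        intervals = np.random.gamma(shape=burstiness, scale=theta, size=100_000)
        # Mean stays ~0.1 s; std shrinks as burstiness grows.
        print(burstiness, round(intervals.mean(), 3), round(intervals.std(), 3))
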
+
+def calculate_metrics(
+ input_requests: List[Tuple[str, int, int]],
+ outputs: List[RequestFuncOutput],
+ dur_s: float,
+ tokenizer: PreTrainedTokenizerBase,
+ selected_percentile_metrics: List[str],
+ selected_percentiles: List[float],
+) -> Tuple[BenchmarkMetrics, List[int]]:
+ actual_output_lens: List[int] = []
+ total_input = 0
+ completed = 0
+ good_completed = 0
+ itls: List[float] = []
+ tpots: List[float] = []
+ all_tpots: List[float] = []
+ ttfts: List[float] = []
+ e2els: List[float] = []
+ for i in range(len(outputs)):
+ if outputs[i].success:
+ # We use the tokenizer to count the number of output tokens for all
+ # serving backends instead of looking at len(outputs[i].itl) since
+ # multiple output tokens may be bundled together
+ # Note : this may inflate the output token count slightly
+ output_len = len(
+ tokenizer(outputs[i].generated_text,
+ add_special_tokens=False).input_ids)
+ actual_output_lens.append(output_len)
+ total_input += input_requests[i].prompt_len
+ tpot = 0
+ if output_len > 1:
+ tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
+ 1)
+ tpots.append(tpot)
+ outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
+ # Note: if output_len <= 1, we regard tpot as 0 for goodput
+ all_tpots.append(tpot)
+ itls += outputs[i].itl
+ ttfts.append(outputs[i].ttft)
+ e2els.append(outputs[i].latency)
+ completed += 1
+ else:
+ actual_output_lens.append(0)
+
+ if completed == 0:
+ warnings.warn(
+ "All requests failed. This is likely due to a misconfiguration "
+ "on the benchmark arguments.",
+ stacklevel=2)
+ metrics = BenchmarkMetrics(
+ completed=completed,
+ total_input=total_input,
+ total_output=sum(actual_output_lens),
+ request_throughput=completed / dur_s,
+ request_goodput=good_completed / dur_s,
+ output_throughput=sum(actual_output_lens) / dur_s,
+ total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
+ mean_ttft_ms=np.mean(ttfts or 0) *
+ 1000, # ttfts is empty if streaming is not supported by backend
+ std_ttft_ms=np.std(ttfts or 0) * 1000,
+ median_ttft_ms=np.median(ttfts or 0) * 1000,
+ percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
+ for p in selected_percentiles],
+ mean_tpot_ms=np.mean(tpots or 0) * 1000,
+ std_tpot_ms=np.std(tpots or 0) * 1000,
+ median_tpot_ms=np.median(tpots or 0) * 1000,
+ percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
+ for p in selected_percentiles],
+ mean_itl_ms=np.mean(itls or 0) * 1000,
+ std_itl_ms=np.std(itls or 0) * 1000,
+ median_itl_ms=np.median(itls or 0) * 1000,
+ percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
+ for p in selected_percentiles],
+ mean_e2el_ms=np.mean(e2els or 0) * 1000,
+ std_e2el_ms=np.std(e2els or 0) * 1000,
+ median_e2el_ms=np.median(e2els or 0) * 1000,
+ percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
+ for p in selected_percentiles],
+ )
+
+ return metrics, actual_output_lens
+
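
A worked instance of the TPOT formula used in calculate_metrics above (all numbers hypothetical): the first token's latency is excluded, and the remaining latency is averaged over the remaining tokens.

    latency, ttft, output_len = 2.0, 0.25, 8    # seconds, seconds, tokens
    tpot = (latency - ttft) / (output_len - 1)  # time per output token
    print(f"TPOT: {tpot * 1000:.1f} ms")        # -> TPOT: 250.0 ms
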
+
+async def benchmark(
+ backend: str,
+ api_url: str,
+ base_url: str,
+ model_id: str,
+ tokenizer: PreTrainedTokenizerBase,
+ input_requests: List[SampleRequest],
+ request_rate: float,
+ burstiness: float,
+ disable_tqdm: bool,
+ profile: bool,
+ selected_percentile_metrics: List[str],
+ selected_percentiles: List[str],
+ ignore_eos: bool,
+ max_concurrency: Optional[int],
+ guided_decoding_ratio: float,
+ guided_decoding_backend: str,
+):
+ if backend in ASYNC_REQUEST_FUNCS:
+ request_func = ASYNC_REQUEST_FUNCS[backend]
+ else:
+ raise ValueError(f"Unknown backend: {backend}")
+
+ def prepare_extra_body(request) -> dict:
+ extra_body = {}
+ # Add the schema to the extra_body
+ extra_body[request.structure_type] = request.schema
+ # Add the specific guided_decoding_backend
+ extra_body["guided_decoding_backend"] = guided_decoding_backend
+ return extra_body
+
+ print("Starting initial single prompt test run...")
+ guided_decoding_req_idx = random.sample(
+ range(len(input_requests)),
+ int(len(input_requests) * guided_decoding_ratio))
+
+ test_request = input_requests[0]
+ test_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_request.prompt,
+ api_url=api_url,
+ prompt_len=test_request.prompt_len,
+ output_len=test_request.expected_output_len,
+ ignore_eos=ignore_eos,
+ extra_body=prepare_extra_body(test_request),
+ )
+ test_output = await request_func(request_func_input=test_input)
+ if not test_output.success:
+ raise ValueError(
+ "Initial test run failed - Please make sure benchmark arguments "
+ f"are correctly specified. Error: {test_output.error}")
+ else:
+ print("Initial test run completed. Starting main benchmark run...")
+
+ if profile:
+ print("Starting profiler...")
+ profile_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_request.prompt,
+ api_url=base_url + "/start_profile",
+ prompt_len=test_request.prompt_len,
+ output_len=test_request.expected_output_len,
+ ignore_eos=ignore_eos,
+ extra_body=prepare_extra_body(test_request),
+ )
+ profile_output = await request_func(request_func_input=profile_input)
+ if profile_output.success:
+ print("Profiler started")
+
+ if burstiness == 1.0:
+ distribution = "Poisson process"
+ else:
+ distribution = "Gamma distribution"
+
+ print(f"Traffic request rate: {request_rate}")
+ print(f"Burstiness factor: {burstiness} ({distribution})")
+ print(f"Maximum request concurrency: {max_concurrency}")
+
+ pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+ # This can be used once the minimum Python version is 3.10 or higher,
+ # and it will simplify the code in limited_request_func.
+ # semaphore = (asyncio.Semaphore(max_concurrency)
+ # if max_concurrency else contextlib.nullcontext())
+ semaphore = (asyncio.Semaphore(max_concurrency)
+ if max_concurrency else None)
+
+ async def limited_request_func(request_func_input, pbar):
+ if semaphore is None:
+ return await request_func(request_func_input=request_func_input,
+ pbar=pbar)
+ async with semaphore:
+ return await request_func(request_func_input=request_func_input,
+ pbar=pbar)
+
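
The commented-out simplification above relies on contextlib.nullcontext being usable under async with from Python 3.10 onward; a minimal sketch of that form (request_func and its input are placeholders):

    import asyncio
    import contextlib

    async def limited_request_func(request_func, request_func_input,
                                   max_concurrency=None):
        # On Python >= 3.10, nullcontext() also works as an async context
        # manager, so the separate "no semaphore" branch can be dropped.
        semaphore = (asyncio.Semaphore(max_concurrency)
                     if max_concurrency else contextlib.nullcontext())
        async with semaphore:
            return await request_func(request_func_input=request_func_input)
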
+ benchmark_start_time = time.perf_counter()
+ tasks: List[asyncio.Task] = []
+ expected: List[str] = []
+ async for i, request in get_request(input_requests, request_rate,
+ burstiness):
+ extra_body = prepare_extra_body(
+ request) if i in guided_decoding_req_idx else None
+ request_func_input = RequestFuncInput(
+ model=model_id,
+ prompt=request.prompt,
+ api_url=api_url,
+ prompt_len=request.prompt_len,
+ output_len=request.expected_output_len,
+ ignore_eos=ignore_eos,
+ extra_body=extra_body,
+ )
+ expected.append(request.completion)
+ tasks.append(
+ asyncio.create_task(
+ limited_request_func(request_func_input=request_func_input,
+ pbar=pbar)))
+ outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+ if profile:
+ print("Stopping profiler...")
+ profile_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_request.prompt,
+ api_url=base_url + "/stop_profile",
+ prompt_len=test_request.prompt_len,
+ output_len=test_request.expected_output_len,
+ extra_body={test_request.structure_type: test_request.schema},
+ )
+ profile_output = await request_func(request_func_input=profile_input)
+ if profile_output.success:
+ print("Profiler stopped")
+
+ if pbar is not None:
+ pbar.close()
+
+ benchmark_duration = time.perf_counter() - benchmark_start_time
+
+ metrics, actual_output_lens = calculate_metrics(
+ input_requests=input_requests,
+ outputs=outputs,
+ dur_s=benchmark_duration,
+ tokenizer=tokenizer,
+ selected_percentile_metrics=selected_percentile_metrics,
+ selected_percentiles=selected_percentiles,
+ )
+
+ print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+ print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+ print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+ benchmark_duration))
+ print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+ print("{:<40} {:<10}".format("Total generated tokens:",
+ metrics.total_output))
+ print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+ metrics.request_throughput))
+ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+ metrics.output_throughput))
+ print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
+ metrics.total_token_throughput))
+
+ result = {
+ "duration":
+ benchmark_duration,
+ "completed":
+ metrics.completed,
+ "total_input_tokens":
+ metrics.total_input,
+ "total_output_tokens":
+ metrics.total_output,
+ "request_throughput":
+ metrics.request_throughput,
+ "output_throughput":
+ metrics.output_throughput,
+ "total_token_throughput":
+ metrics.total_token_throughput,
+ "ttft_description":
+ pd.Series([output.ttft for output in outputs]).describe().to_dict(),
+ "tpot_description":
+ pd.Series([output.tpot for output in outputs]).describe().to_dict(),
+ "input_lens": [output.prompt_len for output in outputs],
+ "output_lens":
+ actual_output_lens,
+ "ttfts": [output.ttft for output in outputs],
+ "itls": [output.itl for output in outputs],
+ "errors": [output.error for output in outputs],
+ }
+
+ ret = [{
+ 'generated': output.generated_text,
+ 'expected': gt
+ } for output, gt in zip(outputs, expected)]
+
+ def process_one_metric(
+ # E.g., "ttft"
+ metric_attribute_name: str,
+ # E.g., "TTFT"
+ metric_name: str,
+ # E.g., "Time to First Token"
+ metric_header: str,
+ ):
+ # This function prints and adds statistics of the specified
+ # metric.
+ if metric_attribute_name not in selected_percentile_metrics:
+ return
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+ print("{:<40} {:<10.2f}".format(
+ f"Mean {metric_name} (ms):",
+ getattr(metrics, f"mean_{metric_attribute_name}_ms")))
+ print("{:<40} {:<10.2f}".format(
+ f"Median {metric_name} (ms):",
+ getattr(metrics, f"median_{metric_attribute_name}_ms")))
+ result[f"mean_{metric_attribute_name}_ms"] = getattr(
+ metrics, f"mean_{metric_attribute_name}_ms")
+ result[f"median_{metric_attribute_name}_ms"] = getattr(
+ metrics, f"median_{metric_attribute_name}_ms")
+ result[f"std_{metric_attribute_name}_ms"] = getattr(
+ metrics, f"std_{metric_attribute_name}_ms")
+ for p, value in getattr(metrics,
+ f"percentiles_{metric_attribute_name}_ms"):
+ p_word = str(int(p)) if int(p) == p else str(p)
+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
+ value))
+ result[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+ process_one_metric("ttft", "TTFT", "Time to First Token")
+ process_one_metric("tpot", "TPOT",
+ "Time per Output Token (excl. 1st token)")
+ process_one_metric("itl", "ITL", "Inter-token Latency")
+ process_one_metric("e2el", "E2EL", "End-to-end Latency")
+
+ print("=" * 50)
+
+ return result, ret
+
+
+def evaluate(ret, args):
+
+ def _eval_correctness_json(expected, actual):
+ # extract json string from string using regex
+ import re
+ actual = actual.replace('\n', '').replace(' ', '').strip()
+ try:
+ actual = re.search(r'\{.*\}', actual).group()
+ actual = json.loads(actual)
+ except Exception:
+ return False
+
+ return True
+
+ def _eval_correctness_choice(expected, actual):
+ return actual in args.choice
+
+ def _eval_correctness_regex(expected, actual):
+ import re
+ return re.match(args.regex, actual) is not None
+
+ def _eval_correctness(expected, actual):
+ if args.structure_type == 'guided_json':
+ return _eval_correctness_json(expected, actual)
+ elif args.structure_type == 'guided_regex':
+ return _eval_correctness_regex(expected, actual)
+ elif args.structure_type == 'guided_choice':
+ return _eval_correctness_choice(expected, actual)
+ else:
+ return None
+
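+    # Illustrative example of the aggregation below: for per-request scores
+    # [True, False, None], the None (unscored) entry is dropped and the
+    # reported correct rate is 1/2 * 100 = 50.0 (%).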
+ scores = []
+ for res in ret:
+ score = _eval_correctness(res['expected'], res['generated'])
+ res['correctness'] = score
+ scores.append(score)
+
+ not_none_scores = [score for score in scores if score is not None]
+
+ return (sum(not_none_scores) / len(not_none_scores) *
+ 100) if len(not_none_scores) > 0 else None
+
+
+def main(args: argparse.Namespace):
+ print(args)
+ random.seed(args.seed)
+ np.random.seed(args.seed)
+
+ backend = args.backend
+ model_id = args.model
+ tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+ if args.base_url is not None:
+ api_url = f"{args.base_url}{args.endpoint}"
+ base_url = f"{args.base_url}"
+ else:
+ api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+ base_url = f"http://{args.host}:{args.port}"
+
+ tokenizer = get_tokenizer(tokenizer_id,
+ trust_remote_code=args.trust_remote_code)
+
+ if args.dataset == 'grammar':
+ args.structure_type = 'guided_grammar'
+ elif args.dataset == 'regex':
+ args.structure_type = 'guided_regex'
+ elif args.dataset == 'choice':
+ args.structure_type = 'guided_choice'
+ else:
+ args.structure_type = 'guided_json'
+
+ if args.no_guided_decoding:
+ args.guided_decoding_ratio = 0
+ if args.save_results:
+ result_file_name = f'{args.guided_decoding_ratio}guided'
+ result_file_name += f"_{backend}"
+ result_file_name += f"_{args.request_rate}qps"
+ result_file_name += f"_{args.model.split('/')[-1]}"
+ result_file_name += f"_{args.dataset}"
+ result_file_name += f"_{args.num_prompts}"
+ result_file_name += f"_out{args.output_len}"
+ result_file_name += ".txt"
+ else:
+ result_file_name = None
+
+ input_requests = sample_requests(tokenizer, args)
+
+ benchmark_result, ret = asyncio.run(
+ benchmark(
+ backend=backend,
+ api_url=api_url,
+ base_url=base_url,
+ model_id=model_id,
+ tokenizer=tokenizer,
+ input_requests=input_requests,
+ request_rate=args.request_rate,
+ burstiness=args.burstiness,
+ disable_tqdm=args.disable_tqdm,
+ profile=args.profile,
+ selected_percentile_metrics=args.percentile_metrics.split(","),
+ selected_percentiles=[
+ float(p) for p in args.metric_percentiles.split(",")
+ ],
+ ignore_eos=args.ignore_eos,
+ max_concurrency=args.max_concurrency,
+ guided_decoding_ratio=args.guided_decoding_ratio,
+ guided_decoding_backend=args.guided_decoding_backend,
+ ))
+
+ # Save config and results to json
+ score = evaluate(ret, args)
+ print("correct_rate(%)", score, '\n')
+ if args.save_results:
+ results = {
+ "backend":
+ backend,
+ "model_id":
+ model_id,
+ "tokenizer_id":
+ tokenizer_id,
+ "num_prompts":
+ args.num_prompts,
+ "request_rate":
+ args.request_rate if args.request_rate < float("inf") else "inf",
+ "burstiness":
+ args.burstiness,
+ "max_concurrency":
+ args.max_concurrency,
+ "correct_rate(%)":
+ score
+ }
+ results = {"outputs": ret, **results, **benchmark_result}
+
+ # Save to file
+ if args.result_filename:
+ result_file_name = args.result_filename
+ if args.result_dir:
+ result_file_name = os.path.join(args.result_dir, result_file_name)
+ with open(result_file_name, "w", encoding='utf-8') as outfile:
+ json.dump(results, outfile, indent=4)
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+ description="Benchmark the online serving throughput.")
+ parser.add_argument(
+ "--backend",
+ type=str,
+ default="vllm",
+ choices=list(ASYNC_REQUEST_FUNCS.keys()),
+ )
+ parser.add_argument(
+ "--base-url",
+ type=str,
+ default=None,
+ help="Server or API base url if not using http host and port.",
+ )
+ parser.add_argument("--host", type=str, default="localhost")
+ parser.add_argument("--port", type=int, default=8000)
+ parser.add_argument(
+ "--endpoint",
+ type=str,
+ default="/v1/completions",
+ help="API endpoint.",
+ )
+ parser.add_argument(
+ "--dataset",
+ default='json',
+ choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+ parser.add_argument("--json_schema_path",
+ type=str,
+ default=None,
+ help="Path to json schema.")
+ parser.add_argument(
+ "--max-concurrency",
+ type=int,
+ default=None,
+ help="Maximum number of concurrent requests. This can be used "
+ "to help simulate an environment where a higher level component "
+ "is enforcing a maximum number of concurrent requests. While the "
+ "--request-rate argument controls the rate at which requests are "
+ "initiated, this argument will control how many are actually allowed "
+ "to execute at a time. This means that when used in combination, the "
+ "actual request rate may be lower than specified with --request-rate, "
+ "if the server is not processing requests fast enough to keep up.")
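+    # A minimal sketch of how such a cap can be enforced (an assumption about
+    # the pattern; see limited_request_func in benchmark() above for this
+    # script's actual wrapper):
+    #   semaphore = asyncio.Semaphore(max_concurrency)
+    #   async def limited_request_func(request_func_input, pbar):
+    #       async with semaphore:
+    #           return await request_func(
+    #               request_func_input=request_func_input, pbar=pbar)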
+ parser.add_argument(
+ "--model",
+ type=str,
+ required=True,
+ help="Name of the model.",
+ )
+ parser.add_argument(
+ "--tokenizer",
+ type=str,
+ help=
+ "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
+ )
+ parser.add_argument(
+ "--num-prompts",
+ type=int,
+ default=1000,
+ help="Number of prompts to process.",
+ )
+ parser.add_argument(
+ "--output-len",
+ type=int,
+ default=128,
+ help="Number of output tokens.",
+ )
+ parser.add_argument(
+ "--request-rate",
+ type=float,
+ default=float("inf"),
+ help="Number of requests per second. If this is inf, "
+ "then all the requests are sent at time 0. "
+        "Otherwise, we use a Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
+ )
+ parser.add_argument(
+ "--burstiness",
+ type=float,
+ default=1.0,
+ help="Burstiness factor of the request generation. "
+        "Only takes effect when request_rate is not inf. "
+        "Default value is 1, which follows a Poisson process. "
+ "Otherwise, the request intervals follow a gamma distribution. "
+ "A lower burstiness value (0 < burstiness < 1) results in more "
+ "bursty requests. A higher burstiness value (burstiness > 1) "
+ "results in a more uniform arrival of requests.",
+ )
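+    # Illustrative sketch (an assumption about the sampler, not necessarily
+    # the exact code used by benchmark()): with request rate r and
+    # burstiness b, inter-arrival times can be drawn as
+    #   interval = np.random.gamma(burstiness, 1.0 / (request_rate * burstiness))
+    # which keeps the mean interval at 1/r while b controls the variance.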
+ parser.add_argument("--seed", type=int, default=0)
+ parser.add_argument(
+ "--trust-remote-code",
+ action="store_true",
+ help="Trust remote code from huggingface",
+ )
+ parser.add_argument(
+ "--disable-tqdm",
+ action="store_true",
+ help="Specify to disable tqdm progress bar.",
+ )
+ parser.add_argument(
+ "--save-results",
+ action="store_true",
+ help="Specify to save benchmark results to a json file",
+ )
+ parser.add_argument(
+ "--profile",
+ action="store_true",
+ help="Use Torch Profiler. The endpoint must be launched with "
+ "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ )
+ parser.add_argument(
+ "--result-dir",
+ type=str,
+ default=None,
+        help="Specify directory to save benchmark json results. "
+ "If not specified, results are saved in the current directory.",
+ )
+ parser.add_argument(
+ "--result-filename",
+ type=str,
+ default=None,
+        help="Specify the filename to save benchmark json results. "
+ "If not specified, results will be saved in "
+ "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+ " format.",
+ )
+ parser.add_argument(
+ "--ignore-eos",
+ action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request. "
+ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+ parser.add_argument(
+ "--percentile-metrics",
+ type=str,
+ default="ttft,tpot,itl",
+        help="Comma-separated list of selected metrics to report "
+        "percentiles for. "
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        "Default value is \"ttft,tpot,itl\".")
+ parser.add_argument(
+ "--metric-percentiles",
+ type=str,
+ default="99",
+        help="Comma-separated list of percentiles for selected metrics. "
+ "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
+ "Default value is \"99\". "
+ "Use \"--percentile-metrics\" to select metrics.",
+ )
+ parser.add_argument("--no-guided-decoding",
+ action='store_true',
+ default=False,
+                        help="Whether to disable guided decoding.")
+ parser.add_argument("--guided-decoding-ratio",
+ type=float,
+ default=1.0,
+ help="Ratio of Guided Decoding requests")
+ parser.add_argument("--guided-decoding-backend",
+ type=str,
+ choices=["outlines", "lm-format-enforcer", "xgrammar"],
+ default="xgrammar",
+ help="Backend to use for guided decoding")
+
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 159cf055737ce..1e5967bd9bf8b 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -294,23 +294,36 @@ def main(args: argparse.Namespace):
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
- # Synthesize a prompt with the given input length.
- # As tokenizer may add additional tokens like BOS, we need to try
- # different lengths to get the desired input length.
- for i in range(-10, 10):
- prompt = "hi " * (args.input_len + i)
- tokenized_prompt = tokenizer(prompt).input_ids
- if len(tokenized_prompt) == args.input_len:
- break
- else:
- raise ValueError(
- f"Failed to synthesize a prompt with {args.input_len} tokens.")
- requests = [
- SampleRequest(prompt=prompt,
- prompt_len=args.input_len,
- expected_output_len=args.output_len)
- for _ in range(args.num_prompts)
- ]
+ vocab_size = tokenizer.vocab_size
+ requests = []
+ for _ in range(args.num_prompts):
+ # Synthesize a prompt with the given input length.
+ candidate_ids = [
+ random.randint(0, vocab_size - 1)
+ for _ in range(args.input_len)
+ ]
+            # As the tokenizer may add additional tokens like BOS, we need to
+            # try different lengths to get the desired input length.
+ for _ in range(5): # Max attempts to correct
+ candidate_prompt = tokenizer.decode(candidate_ids)
+ tokenized_len = len(tokenizer.encode(candidate_prompt))
+
+ if tokenized_len == args.input_len:
+ break
+
+ # Adjust length based on difference
+ diff = args.input_len - tokenized_len
+ if diff > 0:
+ candidate_ids.extend([
+ random.randint(100, vocab_size - 100)
+ for _ in range(diff)
+ ])
+ else:
+ candidate_ids = candidate_ids[:diff]
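+                # e.g. if input_len is 512 but the decoded prompt re-encodes
+                # to 515 tokens, diff is -3 and the last three candidate ids
+                # are dropped before the next attempt.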
+ requests.append(
+ SampleRequest(prompt=candidate_prompt,
+ prompt_len=args.input_len,
+ expected_output_len=args.output_len))
else:
requests = sample_requests(tokenizer, args)
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
new file mode 100644
index 0000000000000..3d1c5e392f9e2
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -0,0 +1,384 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from utils import make_rand_sparse_tensors
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.utils import FlexibleArgumentParser
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+DEFAULT_TP_SIZES = [1]
+
+
+# bench
+def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
+ **kwargs) -> TMeasurement:
+ min_run_time = 1
+
+ globals = {
+ "args": args,
+ "kwargs": kwargs,
+ "fn": fn,
+ }
+ return TBenchmark.Timer(
+ stmt="fn(*args, **kwargs)",
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description=description,
+ ).blocked_autorange(min_run_time=min_run_time)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+ sub_label: str) -> Iterable[TMeasurement]:
+ assert dtype == torch.int8
+ b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
+ scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+
+ out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
+ torch.bfloat16)
+ out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
+
+ if not torch.allclose(out, out_ref):
+ print("Incorrect results")
+ print(out)
+ print(out_ref)
+ else:
+ print("Correct results")
+
+ timers = []
+ # pytorch impl - bfloat16
+ timers.append(
+ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+ torch.mm, a.to(dtype=torch.bfloat16),
+ b.to(dtype=torch.bfloat16)))
+
+ # pytorch impl - float16
+ timers.append(
+ bench_fn(label, sub_label,
+ "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
+ a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
+
+ # cutlass impl
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+ torch.bfloat16))
+
+ # cutlass with bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+ bias))
+
+ # cutlass sparse impl
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.bfloat16))
+
+ # cutlass sparse with bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.bfloat16, bias))
+
+ return timers
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+ sub_label: str) -> Iterable[TMeasurement]:
+ assert dtype == torch.float8_e4m3fn
+ b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
+ k)
+ scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+
+ out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
+ torch.bfloat16)
+ out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
+
+ if not torch.allclose(out, out_ref):
+ print("Incorrect results")
+ print(out)
+ print(out_ref)
+ else:
+ print("Correct results")
+
+ timers = []
+
+ # pytorch impl w. bf16
+ timers.append(
+ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+ torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
+ b.to(dtype=torch.bfloat16, device="cuda")))
+
+ # pytorch impl: bf16 output, without fp8 fast accum
+ timers.append(
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_bf16_scaled_mm",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.bfloat16))
+
+ # pytorch impl: bf16 output, with fp8 fast accum
+ timers.append(
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.bfloat16,
+ use_fast_accum=True))
+
+ # pytorch impl: fp16 output, without fp8 fast accum
+ timers.append(
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_fp16_scaled_mm",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.float16))
+
+ # pytorch impl: fp16 output, with fp8 fast accum
+ timers.append(
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.float16,
+ use_fast_accum=True))
+
+ # cutlass impl: bf16 output
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+ torch.bfloat16))
+
+ # cutlass impl: bf16 output
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.bfloat16))
+
+ # cutlass impl: fp16 output
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.float16))
+
+ # cutlass impl: bf16 output, with bias
+ timers.append(
+ bench_fn(label, sub_label,
+ "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.bfloat16, bias))
+
+ # cutlass impl: fp16 output, with bias
+ timers.append(
+ bench_fn(label, sub_label,
+ "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.float16, bias.to(dtype=torch.float16)))
+
+ return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+ sub_label: str) -> Iterable[TMeasurement]:
+ if dtype == torch.int8:
+ return bench_int8(dtype, m, k, n, label, sub_label)
+ if dtype == torch.float8_e4m3fn:
+ return bench_fp8(dtype, m, k, n, label, sub_label)
+ raise ValueError("unsupported type")
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+def run(dtype: torch.dtype,
+ MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+ results = []
+ for m, k, n in MKNs:
+ timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
+ f"MKN=({m}x{k}x{n})")
+ print_timers(timers)
+ results.extend(timers)
+
+ return results
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+ MKNs: Iterable[Tuple[int, int, int]],
+ base_description: str,
+ timestamp=None):
+ print(f"== All Results {base_description} ====")
+ print_timers(data)
+
+ # pickle all the results
+ timestamp = int(time.time()) if timestamp is None else timestamp
+ with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
+ pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+ dim_sizes = list(
+ range(args.dim_start, args.dim_end + 1, args.dim_increment))
+ MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
+ data = run(args.dtype, MKNs)
+
+ make_output(data, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+ dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
+ n = len(dim_sizes)
+ Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
+ Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
+ Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
+ MKNs = list(zip(Ms, Ks, Ns))
+ data = run(args.dtype, MKNs)
+
+ make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+ print("Benchmarking models:")
+ for i, model in enumerate(args.models):
+ print(f"[{i}] {model}")
+
+ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+ KNs = []
+ for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+ KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+ KNs.append(KN)
+ return KNs
+
+ model_bench_data = []
+ models_tps = list(itertools.product(args.models, args.tp_sizes))
+ for model, tp_size in models_tps:
+ Ms = args.batch_sizes
+ KNs = model_shapes(model, tp_size)
+ MKNs = []
+ for m in Ms:
+ for k, n in KNs:
+ MKNs.append((m, k, n))
+
+ data = run(args.dtype, MKNs)
+ model_bench_data.append(data)
+
+ # Print all results
+ for data, model_tp in zip(model_bench_data, models_tps):
+ model, tp_size = model_tp
+ print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+ print_timers(data)
+
+ timestamp = int(time.time())
+
+ all_data = []
+ for d in model_bench_data:
+ all_data.extend(d)
+ # pickle all data
+ with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+ pkl.dump(all_data, f)
+
+
+if __name__ == '__main__':
+
+ def to_torch_dtype(dt):
+ if dt == "int8":
+ return torch.int8
+ if dt == "fp8":
+ return torch.float8_e4m3fn
+ raise ValueError("unsupported dtype")
+
+ parser = FlexibleArgumentParser(
+ description="""
+Benchmark Cutlass GEMM.
+
+ To run square GEMMs:
+ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+
+ To run constant N and K and sweep M:
+ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+
+ To run dimensions from a model:
+ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+
+ Output:
+ - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
+ """, # noqa: E501
+ formatter_class=argparse.RawTextHelpFormatter)
+
+ parser.add_argument("--dtype",
+ type=to_torch_dtype,
+ required=True,
+ help="Available options are ['int8', 'fp8']")
+ subparsers = parser.add_subparsers(dest="cmd")
+
+ square_parser = subparsers.add_parser("square_bench")
+ square_parser.add_argument("--dim-start", type=int, required=True)
+ square_parser.add_argument("--dim-end", type=int, required=True)
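+    # e.g. for k = n = 4096 with 4-bit weights each copy is
+    # 4096 * 4096 * 4 / 8 bytes = 8 MiB, so roughly 13 copies are generated.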
+ square_parser.add_argument("--dim-increment", type=int, required=True)
+ square_parser.set_defaults(func=run_square_bench)
+
+ range_parser = subparsers.add_parser("range_bench")
+ range_parser.add_argument("--dim-start", type=int, required=True)
+ range_parser.add_argument("--dim-end", type=int, required=True)
+ range_parser.add_argument("--dim-increment", type=int, required=True)
+ range_parser.add_argument("--m-constant", type=int, default=None)
+ range_parser.add_argument("--n-constant", type=int, default=None)
+ range_parser.add_argument("--k-constant", type=int, default=None)
+ range_parser.set_defaults(func=run_range_bench)
+
+ model_parser = subparsers.add_parser("model_bench")
+ model_parser.add_argument("--models",
+ nargs="+",
+ type=str,
+ default=DEFAULT_MODELS,
+ choices=WEIGHT_SHAPES.keys())
+ model_parser.add_argument("--tp-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_TP_SIZES)
+ model_parser.add_argument("--batch-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_BATCH_SIZES)
+ model_parser.set_defaults(func=run_model_bench)
+
+ args = parser.parse_args()
+ args.func(args)
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
new file mode 100644
index 0000000000000..ef06fcd6604dd
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -0,0 +1,96 @@
+# Cutlass bench utils
+from typing import Iterable, Tuple
+
+import torch
+
+import vllm._custom_ops as ops
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+ finfo = torch.finfo(torch.float8_e4m3fn)
+ return torch.round(tensor.clamp(
+ min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+ return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
+ return tensor.to(dtype=torch.bfloat16)
+
+
+def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
+ return tensor.to(dtype=torch.float16)
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+ k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+ a = torch.randn((m, k), device='cuda') * 5
+ b = torch.randn((n, k), device='cuda').t() * 5
+
+ if dtype == torch.int8:
+ return to_int8(a), to_int8(b)
+ if dtype == torch.float8_e4m3fn:
+ return to_fp8(a), to_fp8(b)
+
+ raise ValueError("unsupported dtype")
+
+
+def prune_to_2_4(tensor):
+ # Reshape tensor to [N, 4] where N is number of groups of 4
+ original_shape = tensor.shape
+ reshaped = tensor.reshape(-1, 4)
+
+ # Get indices of top 2 absolute values in each group of 4
+ _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)
+
+ # Create binary mask
+ mask = torch.zeros_like(reshaped)
+ mask.scatter_(dim=1,
+ index=indices,
+ src=torch.ones_like(indices, dtype=mask.dtype))
+
+ # Apply mask and reshape back
+ pruned = reshaped * mask
+
+ # Turn all -0.0 to 0.0
+ pruned[pruned == -0.0] = 0.0
+
+ return pruned.reshape(original_shape)
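+# Illustrative example: a group [0.5, -2.0, 0.3, 4.0] keeps its two largest
+# magnitudes and becomes [0.0, -2.0, 0.0, 4.0] (2:4 structured sparsity).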
+
+
+def make_rand_sparse_tensors(
+    dtype: torch.dtype, m: int, n: int, k: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ a = torch.randn((m, k), device='cuda') * 5
+ b = torch.randn((n, k), device='cuda').t() * 5
+
+ b = prune_to_2_4(b.t()).t()
+
+ if dtype == torch.int8:
+ a, b = to_int8(a), to_int8(b)
+ elif dtype == torch.float8_e4m3fn:
+ a, b = to_fp8(a), to_fp8(b)
+ elif dtype == torch.float16:
+ a, b = to_fp16(a), to_fp16(b)
+ elif dtype == torch.bfloat16:
+ a, b = to_bf16(a), to_bf16(b)
+ else:
+ raise ValueError("unsupported dtype")
+
+ b_compressed, e = ops.cutlass_sparse_compress(b.t())
+
+ # Compressed B, Metadata, Original A, B
+ return b_compressed, e, a, b
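+# Example usage, mirroring how sparse_benchmarks.py consumes the return values:
+#   b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
+#   out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
+#                                      torch.bfloat16)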
+
+
+def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
+                               m: int, n: int, k: int) -> \
+        Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor],
+              Iterable[torch.Tensor], Iterable[torch.Tensor]]:
+    ABs = []
+    for _ in range(num_tensors):
+        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
+        if b_comp is not None:
+            # reuse the tensors generated above instead of sampling them twice
+            ABs.append((b_comp, e, a, b))
+ BComps, Es, As, Bs = zip(*ABs)
+ return list(BComps), list(Es), list(As), list(Bs)
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 63cf5d50cac75..d0353bc8cb42a 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -8,6 +8,7 @@
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
+from utils import make_rand_tensors
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
@@ -17,31 +18,6 @@
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
-# helpers
-
-
-def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
- finfo = torch.finfo(torch.float8_e4m3fn)
- return torch.round(tensor.clamp(
- min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor) -> torch.Tensor:
- return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
- k: int) -> Tuple[torch.Tensor, torch.Tensor]:
- a = torch.randn((m, k), device='cuda') * 5
- b = torch.randn((n, k), device='cuda').t() * 5
-
- if dtype == torch.int8:
- return to_int8(a), to_int8(b)
- if dtype == torch.float8_e4m3fn:
- return to_fp8(a), to_fp8(b)
-
- raise ValueError("unsupported dtype")
-
# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
@@ -386,4 +362,4 @@ def to_torch_dtype(dt):
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
- args.func(args)
+ args.func(args)
\ No newline at end of file
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
index 25ec9d6028627..d58fb0bf86374 100644
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -40,4 +40,4 @@
([8192, 57344], 1),
([28672, 8192], 0),
],
-}
+}
\ No newline at end of file
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
new file mode 100644
index 0000000000000..2924ea4a49f54
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+# benchmark the overhead of disaggregated prefill.
+# methodology:
+# - send all requests to the prefill vLLM instance; it will buffer the KV cache.
+# - then send all requests to the decode instance.
+# - the TTFT of the decode instance is the overhead of disaggregated prefill.
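+#
+# Concretely, benchmark() below runs ../benchmark_serving.py twice: first
+# against the prefill instance on port 8100 (to populate the KV buffer), then
+# against the decode instance on port 8200, whose reported TTFT is the
+# disaggregated-prefill overhead.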
+
+set -ex
+
+kill_gpu_processes() {
+ # kill all processes on GPU.
+ pkill -f pt_main_thread
+ sleep 10
+
+ # remove vllm config file
+ rm -rf ~/.config/vllm
+
+ # Print the GPU memory usage
+ # so that we know if all GPU processes are killed.
+ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+ # The memory usage should be 0 MB.
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+wait_for_server() {
+ # wait for vllm server to start
+  # return 1 if the server does not come up before the timeout
+ local port=$1
+ timeout 1200 bash -c "
+ until curl -s localhost:${port}/v1/completions > /dev/null; do
+ sleep 1
+ done" && return 0 || return 1
+}
+
+
+benchmark() {
+
+ export VLLM_LOGGING_LEVEL=DEBUG
+ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+ # compare chunked prefill with disaggregated prefill
+
+ results_folder="./results"
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ dataset_name="sonnet"
+ dataset_path="../sonnet_4x.txt"
+ num_prompts=10
+ qps=$1
+ prefix_len=50
+ input_len=2048
+ output_len=$2
+
+
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --port 8100 \
+ --max-model-len 10000 \
+ --gpu-memory-utilization 0.6 \
+ --kv-transfer-config \
+ '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --port 8200 \
+ --max-model-len 10000 \
+ --gpu-memory-utilization 0.6 \
+ --kv-transfer-config \
+ '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+ wait_for_server 8100
+ wait_for_server 8200
+
+ # let the prefill instance finish prefill
+ python3 ../benchmark_serving.py \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8100 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename disagg_prefill_2xtp4.json \
+ --request-rate "inf"
+
+
+ # send the request to decode.
+ # The TTFT of this command will be the overhead of disagg prefill impl.
+ python3 ../benchmark_serving.py \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8200 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename disagg_prefill_2xtp4.json \
+ --request-rate "$qps"
+ kill_gpu_processes
+
+}
+
+
+main() {
+
+ (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+ (which jq) || (apt-get -y install jq)
+ (which socat) || (apt-get -y install socat)
+
+ pip install quart httpx
+
+ cd "$(dirname "$0")"
+
+ cd ..
+  # create sonnet_4x.txt
+ echo "" > sonnet_4x.txt
+ for _ in {1..4}
+ do
+ cat sonnet.txt >> sonnet_4x.txt
+ done
+ cd disagg_benchmarks
+
+ rm -rf results
+ mkdir results
+
+ default_qps=1
+ default_output_len=1
+ benchmark $default_qps $default_output_len
+
+}
+
+
+main "$@"
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
new file mode 100644
index 0000000000000..d8d9e976dce76
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+
+# Requirement: 8x H100 GPUs.
+
+
+# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
+# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
+# Resource: 8x H100
+# Approaches:
+# 1. Chunked prefill: 1 vllm instance with tp=8
+# 2. Chunked prefill: 2 vllm instances with tp=4, equivalent to 1 tp=4 instance with QPS 4
+# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
+# Prefilling instance: max_output_token=1
+# Decoding instance: force the input tokens to be the same across requests to bypass prefilling
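+#
+# In the script below, launch_chunked_prefill() starts two single-GPU vLLM
+# instances behind round_robin_proxy.py, and launch_disagg_prefill() starts a
+# prefill/decode pair behind disagg_prefill_proxy_server.py; both proxies
+# listen on port 8000, which is the port benchmark_serving.py targets.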
+
+set -ex
+
+kill_gpu_processes() {
+ # kill all processes on GPU.
+ pgrep pt_main_thread | xargs -r kill -9
+ pgrep python3 | xargs -r kill -9
+ for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
+ sleep 1
+}
+
+wait_for_server() {
+ # wait for vllm server to start
+  # return 1 if the server does not come up before the timeout
+ local port=$1
+ timeout 1200 bash -c "
+ until curl -s localhost:${port}/v1/completions > /dev/null; do
+ sleep 1
+ done" && return 0 || return 1
+}
+
+
+launch_chunked_prefill() {
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ # disagg prefill
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
+ --port 8100 \
+ --max-model-len 10000 \
+ --enable-chunked-prefill \
+ --gpu-memory-utilization 0.6 &
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
+ --port 8200 \
+ --max-model-len 10000 \
+ --enable-chunked-prefill \
+ --gpu-memory-utilization 0.6 &
+ wait_for_server 8100
+ wait_for_server 8200
+ python3 round_robin_proxy.py &
+ sleep 1
+}
+
+
+launch_disagg_prefill() {
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ # disagg prefill
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
+ --port 8100 \
+ --max-model-len 10000 \
+ --gpu-memory-utilization 0.6 \
+ --kv-transfer-config \
+ '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
+ --port 8200 \
+ --max-model-len 10000 \
+ --gpu-memory-utilization 0.6 \
+ --kv-transfer-config \
+ '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+ wait_for_server 8100
+ wait_for_server 8200
+ python3 disagg_prefill_proxy_server.py &
+ sleep 1
+}
+
+
+benchmark() {
+ results_folder="./results"
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ dataset_name="sonnet"
+ dataset_path="../sonnet_4x.txt"
+ num_prompts=100
+ qps=$1
+ prefix_len=50
+ input_len=1024
+ output_len=$2
+ tag=$3
+
+ python3 ../benchmark_serving.py \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8000 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename "$tag"-qps-"$qps".json \
+ --request-rate "$qps"
+
+ sleep 2
+
+}
+
+
+main() {
+
+ (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+ (which jq) || (apt-get -y install jq)
+ (which socat) || (apt-get -y install socat)
+
+ pip install quart httpx matplotlib aiohttp
+
+ cd "$(dirname "$0")"
+
+ cd ..
+  # create sonnet_4x.txt so that we can sample 2048 tokens for input
+ echo "" > sonnet_4x.txt
+ for _ in {1..4}
+ do
+ cat sonnet.txt >> sonnet_4x.txt
+ done
+ cd disagg_benchmarks
+
+ rm -rf results
+ mkdir results
+
+ default_output_len=6
+
+ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+ launch_chunked_prefill
+ for qps in 2 4 6 8; do
+ benchmark $qps $default_output_len chunked_prefill
+ done
+ kill_gpu_processes
+
+ launch_disagg_prefill
+ for qps in 2 4 6 8; do
+ benchmark $qps $default_output_len disagg_prefill
+ done
+ kill_gpu_processes
+
+ python3 visualize_benchmark_results.py
+
+}
+
+
+main "$@"
diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
new file mode 100644
index 0000000000000..4058b1c0a3b79
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@@ -0,0 +1,61 @@
+import os
+
+import aiohttp
+from quart import Quart, make_response, request
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+app = Quart(__name__)
+
+
+async def forward_request(url, data):
+ async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+ headers = {
+ "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+ }
+ async with session.post(url=url, json=data,
+ headers=headers) as response:
+ if response.status == 200:
+ # if response.headers.get('Transfer-Encoding') == 'chunked':
+ if True:
+ async for chunk_bytes in response.content.iter_chunked(
+ 1024):
+ yield chunk_bytes
+ else:
+ content = await response.read()
+ yield content
+
+
+@app.route('/v1/completions', methods=['POST'])
+async def handle_request():
+ try:
+ original_request_data = await request.get_json()
+
+ prefill_request = original_request_data.copy()
+ # change max_tokens = 1 to let it only do prefill
+ prefill_request['max_tokens'] = 1
+
+ # finish prefill
+ async for _ in forward_request('http://localhost:8100/v1/completions',
+ prefill_request):
+ continue
+
+ # return decode
+ generator = forward_request('http://localhost:8200/v1/completions',
+ original_request_data)
+ response = await make_response(generator)
+ response.timeout = None
+
+ return response
+
+ except Exception as e:
+ import sys
+ import traceback
+ exc_info = sys.exc_info()
+ print("Error occurred in disagg prefill proxy server")
+ print(e)
+ print("".join(traceback.format_exception(*exc_info)))
+
+
+if __name__ == '__main__':
+ app.run(port=8000)
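+    # Illustrative client call once both vLLM instances and this proxy are up
+    # (any OpenAI-compatible /v1/completions payload works), e.g.:
+    #   import requests
+    #   requests.post("http://localhost:8000/v1/completions",
+    #                 json={"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    #                       "prompt": "San Francisco is a",
+    #                       "max_tokens": 16})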
diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py
new file mode 100644
index 0000000000000..6eb5f63980070
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py
@@ -0,0 +1,60 @@
+import asyncio
+import itertools
+
+import aiohttp
+from aiohttp import web
+
+
+class RoundRobinProxy:
+
+ def __init__(self, target_ports):
+ self.target_ports = target_ports
+ self.port_cycle = itertools.cycle(self.target_ports)
+
+ async def handle_request(self, request):
+ target_port = next(self.port_cycle)
+ target_url = f"http://localhost:{target_port}{request.path_qs}"
+
+ async with aiohttp.ClientSession() as session:
+ try:
+ # Forward the request
+ async with session.request(
+ method=request.method,
+ url=target_url,
+ headers=request.headers,
+ data=request.content,
+ ) as response:
+ # Start sending the response
+ resp = web.StreamResponse(status=response.status,
+ headers=response.headers)
+ await resp.prepare(request)
+
+ # Stream the response content
+ async for chunk in response.content.iter_any():
+ await resp.write(chunk)
+
+ await resp.write_eof()
+ return resp
+
+ except Exception as e:
+ return web.Response(text=f"Error: {str(e)}", status=500)
+
+
+async def main():
+ proxy = RoundRobinProxy([8100, 8200])
+ app = web.Application()
+ app.router.add_route('*', '/{path:.*}', proxy.handle_request)
+
+ runner = web.AppRunner(app)
+ await runner.setup()
+ site = web.TCPSite(runner, 'localhost', 8000)
+ await site.start()
+
+ print("Proxy server started on http://localhost:8000")
+
+ # Keep the server running
+ await asyncio.Event().wait()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
new file mode 100644
index 0000000000000..e59d8bb0e6c8c
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
@@ -0,0 +1,46 @@
+import json
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+if __name__ == "__main__":
+
+ data = []
+ for name in ['disagg_prefill', 'chunked_prefill']:
+ for qps in [2, 4, 6, 8]:
+ with open(f"results/{name}-qps-{qps}.json") as f:
+ x = json.load(f)
+ x['name'] = name
+ x['qps'] = qps
+ data.append(x)
+
+ df = pd.DataFrame.from_dict(data)
+ dis_df = df[df['name'] == 'disagg_prefill']
+ chu_df = df[df['name'] == 'chunked_prefill']
+
+ plt.style.use('bmh')
+ plt.rcParams['font.size'] = 20
+
+ for key in [
+ 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms',
+ 'median_itl_ms', 'p99_itl_ms'
+ ]:
+
+ fig, ax = plt.subplots(figsize=(11, 7))
+ plt.plot(dis_df['qps'],
+ dis_df[key],
+ label='disagg_prefill',
+ marker='o',
+ linewidth=4)
+ plt.plot(chu_df['qps'],
+ chu_df[key],
+ label='chunked_prefill',
+ marker='o',
+ linewidth=4)
+ ax.legend()
+
+ ax.set_xlabel('QPS')
+ ax.set_ylabel(key)
+ ax.set_ylim(bottom=0)
+ fig.savefig(f'results/{key}.png')
+ plt.close(fig)
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
new file mode 100644
index 0000000000000..ef91f9f8eb529
--- /dev/null
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -0,0 +1,173 @@
+import pickle as pkl
+import time
+from dataclasses import dataclass
+from itertools import product
+from typing import Callable, Iterable, List, Optional
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from tqdm import tqdm
+
+import vllm._custom_ops as ops
+from vllm.model_executor.layers.layernorm import RMSNorm
+
+
+@dataclass
+class bench_params_t:
+ num_tokens: int
+ hidden_size: int
+ add_residual: bool
+ dtype: torch.dtype
+
+ def description(self):
+ return (f'N {self.num_tokens} '
+ f'x D {self.hidden_size} '
+ f'x R {self.add_residual} '
+ f'x DT {self.dtype}')
+
+
+def get_bench_params() -> List[bench_params_t]:
+ ## Test Fixtures
+ NUM_TOKENS = [2**x for x in range(11)]
+ HIDDEN_SIZES = list(range(1024, 8129, 1024))
+ ADD_RESIDUAL = [True, False]
+ DTYPES = [torch.bfloat16, torch.float]
+
+ combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+    bench_params = [
+        bench_params_t(*combination) for combination in combinations
+    ]
+ return bench_params
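+# Note: NUM_TOKENS has 11 values and range(1024, 8129, 1024) yields 7 hidden
+# sizes (1024 .. 7168), so 11 * 7 * 2 * 2 = 308 parameter combinations are
+# benchmarked.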
+
+
+# Reference impls
+def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
+ residual: Optional[torch.Tensor],
+ quant_dtype: torch.dtype):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _, _ = ops.scaled_int8_quant(torch_out)
+
+
+def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
+ residual: Optional[torch.Tensor],
+ quant_dtype: torch.dtype):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _ = ops.scaled_fp8_quant(torch_out)
+
+
+def fused_impl(
+ rms_norm_layer: RMSNorm, # this stores the weights
+ x: torch.Tensor,
+ residual: Optional[torch.Tensor],
+ quant_dtype: torch.dtype):
+ out, _ = ops.rms_norm_dynamic_per_token_quant(x,
+ rms_norm_layer.weight,
+ 1e-6,
+ quant_dtype,
+ residual=residual)
+
+
+# Bench functions
+def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor,
+ quant_dtype: torch.dtype, label: str, sub_label: str,
+ fn: Callable, description: str) -> TMeasurement:
+
+ min_run_time = 1
+
+ globals = {
+ "rms_norm_layer": rms_norm_layer,
+ "x": x,
+ "residual": residual,
+ "quant_dtype": quant_dtype,
+ "fn": fn,
+ }
+ return TBenchmark.Timer(
+ stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description=description,
+ ).blocked_autorange(min_run_time=min_run_time)
+
+def bench(params: bench_params_t, label: str, sub_label: str) \
+ -> Iterable[TMeasurement]:
+
+ # Make inputs
+ layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype)
+ # Make weights
+ layer.weight.data.normal_(mean=1.0, std=0.1)
+ # Make inputs
+ scale = 1 / params.hidden_size
+ x = torch.randn(params.num_tokens,
+ params.hidden_size,
+ dtype=params.dtype,
+ device='cuda') * scale
+ residual = (torch.randn_like(x) * scale).to(device='cuda') \
+ if params.add_residual else None
+
+ timers = []
+
+ # unfused int8 impl.
+ timers.append(
+ bench_fn(layer, x, residual, torch.int8, label, sub_label,
+ unfused_int8_impl, "unfused_int8_impl"))
+
+ # unfused fp8 impl.
+ timers.append(
+ bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
+ unfused_fp8_impl, "unfused_fp8_impl"))
+
+ # fused int8 impl.
+ timers.append(
+ bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl,
+ "fused_int8_impl"))
+
+ # fused fp8 impl.
+ timers.append(
+ bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
+ fused_impl, "fused_fp8_impl"))
+
+ print_timers(timers)
+
+ return timers
+
+
+# launch bench
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+def main():
+ torch.set_default_device('cuda')
+ bench_params = get_bench_params()
+
+ timers = []
+ for bp in tqdm(bench_params):
+ timers.extend(
+ bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
+ print_timers(timers)
+
+ # pickle all the results
+ timestamp = int(time.time())
+ with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f:
+ pkl.dump(timers, f)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 665b50bf18cf0..46bab74ae8adf 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -2,8 +2,10 @@
import copy
import itertools
import math
+import os
import pickle as pkl
import time
+from dataclasses import dataclass
from itertools import product
from typing import Callable, Iterable, List, Optional, Tuple
@@ -15,11 +17,12 @@
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
- GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
+ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales,
+ marlin_zero_points)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
- gptq_pack, pack_rows, quantize_weights)
+ pack_rows, quantize_weights)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
@@ -27,149 +30,350 @@
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
DEFAULT_TP_SIZES = [1]
+NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False)
+
+if NVTX_PROFILE:
+ import nvtx
+
+
+def terse_type_name(dt):
+ return {
+ torch.bfloat16: "bf16",
+ torch.float16: "fp16",
+ torch.int8: "int8",
+ torch.float8_e4m3fn: "fp8",
+ torch.float: "float",
+ torch.int: "int",
+ }[dt]
+
+
+@dataclass
+class BenchmarkTensors:
+ w_ref: torch.Tensor
+ a: torch.Tensor
+
+ w_q: torch.Tensor
+ group_size: Optional[int]
+ wtype: ScalarType
+ w_g_s: torch.Tensor
+ w_g_zp: Optional[torch.Tensor]
+ w_ch_s: Optional[torch.Tensor]
+ w_tok_s: Optional[torch.Tensor]
+
+
+@dataclass
+class TypeConfig:
+ act_type: torch.dtype
+ weight_type: ScalarType
+ output_type: Optional[torch.dtype]
+ group_scale_type: Optional[torch.dtype]
+ group_zero_type: Optional[torch.dtype]
+ channel_scale_type: Optional[torch.dtype]
+ token_scale_type: Optional[torch.dtype]
+
+
+def rand_data(shape, dtype=torch.float16, scale=1):
+ if dtype.is_floating_point:
+ return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype)
+ else:
+ return torch.randint(-15, 15, shape, dtype=dtype, device="cuda")
+
+
+def quantize_and_pack(atype: torch.dtype,
+ w: torch.Tensor,
+ wtype: ScalarType,
+ stype: Optional[torch.dtype],
+ group_size: Optional[int],
+ zero_points: bool = False):
+ assert wtype.is_integer(), "TODO: support floating point weights"
+
+ w_ref, w_q, w_s, w_zp = quantize_weights(
+ w,
+ wtype,
+ group_size=group_size,
+ zero_points=zero_points,
+ # to match how the kernel applies zps
+ ref_zero_points_after_scales=True)
-def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
- w_q = w_q.t().contiguous().t() # make col major
- return ops.machete_prepack_B(w_q, wtype)
+ return w_ref, w_q, w_s, w_zp
-def make_bench_tensors(
- atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
- k: int
-) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
- torch.tensor]]]:
- assert wtype.is_integer(), "TODO: support floating point weights"
+def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
+ group_size: Optional[int]) -> List[BenchmarkTensors]:
+ m, n, k = shape
# we want to make sure that weights don't fit into L2 cache between runs so
# we construct enough weights to exceed L2 cache, which is 50mb on a H100
# so we target total weight size > 2*50mb
- num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
-
- a = torch.randn((m, k), device="cuda", dtype=atype) * 5
- weights = [
- torch.randn((k, n), device="cuda", dtype=atype)
- for _ in range(num_weights)
- ]
- quanitized_weights = [
- quantize_weights(w, wtype, group_size) for w in weights
- ]
-
- return a, quanitized_weights
+ num_weights = math.ceil(2 * 50 * 1024**2 * 8 /
+ (k * n * types.weight_type.size_bits))
+
+ a = rand_data((m, k), types.act_type, scale=5)
+
+ benchmark_tensors: List[BenchmarkTensors] = []
+ for _ in range(num_weights):
+ w = rand_data((k, n), types.act_type, scale=5)
+
+ if types.group_scale_type is not None:
+ w = w.to(types.group_scale_type)
+ if w.dtype.itemsize == 1:
+ w = w.to(torch.float16)
+
+ w_ref, w_q_packed, w_s, w_zp = quantize_and_pack(
+ a.dtype, w, types.weight_type, types.group_scale_type, group_size,
+ types.group_zero_type is not None)
+
+ if not a.dtype.is_floating_point:
+ aiinfo = torch.iinfo(a.dtype)
+ w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max)
+
+ w_ref = w_ref.to(torch.float32)
+
+ w_ch_s = None if types.channel_scale_type is None else\
+ rand_data((n,), types.channel_scale_type)
+ w_tok_s = None if types.token_scale_type is None else\
+ rand_data((m,), types.token_scale_type)
+
+ benchmark_tensors.append(
+ BenchmarkTensors(w_ref=w_ref,
+ a=a,
+ w_q=w_q_packed,
+ wtype=types.weight_type,
+ w_g_s=w_s,
+ w_g_zp=w_zp,
+ group_size=group_size,
+ w_ch_s=w_ch_s,
+ w_tok_s=w_tok_s))
+
+ return benchmark_tensors
+
+
+def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ a = bt.a
+ w = bt.w_ref.to(bt.a.dtype) # use float reference tensor
+ if a.dtype not in [torch.float16, torch.bfloat16]:
+ a = a.to(torch.float16)
+ w = w.to(torch.float16)
+ return lambda: torch.matmul(a, w)
+
+
+def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ if bt.w_ch_s is not None and bt.w_tok_s is not None:
+ scale_a = bt.w_tok_s.to(torch.float32)
+ scale_b = bt.w_ch_s.to(torch.float32)
+ else:
+ scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+ scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+ w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t()
+ return lambda: ops.cutlass_scaled_mm(
+ bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16)
+
+
+def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ device = bt.a.device
+
+ workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_MAX_PARALLEL)
+
+ if bt.w_g_zp is None:
+ w_zp = torch.empty(0, dtype=torch.int, device=device)
+ else:
+ w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.wtype.size_bits)
+
+ if bt.group_size is None:
+ w_s = torch.tensor([], device="cuda", dtype=torch.half)
+ else:
+ w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.group_size)
+
+ sort_indices = torch.empty(0, dtype=torch.int, device=device)
+ g_idx = torch.empty(0, dtype=torch.int, device=device)
+ w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.wtype.size_bits)
+
+ if bt.a.dtype.is_floating_point:
+ assert bt.w_ch_s is None
+ assert bt.w_tok_s is None
+ assert bt.group_size is not None
+
+ fn = lambda: ops.gptq_marlin_gemm(a=bt.a,
+ b_q_weight=w_q,
+ b_scales=w_s,
+ b_zeros=w_zp,
+ g_idx=g_idx,
+ perm=sort_indices,
+ workspace=workspace.scratch,
+ b_q_type=bt.wtype,
+ size_m=bt.a.shape[0],
+ size_n=bt.w_ref.shape[1],
+ size_k=bt.w_ref.shape[0],
+ is_k_full=True,
+ is_zp_float=False)
+ else:
+ assert bt.a.dtype == torch.int8
+ assert bt.wtype == scalar_types.uint4b8
+
+ if bt.w_ch_s is not None:
+ s_ch = bt.w_ch_s.to(torch.float32)
+ else:
+ s_ch = torch.ones(bt.w_ref.shape[1],
+ dtype=torch.float32,
+ device=device)
+
+ if bt.w_tok_s is not None:
+ s_tok = bt.w_tok_s.to(torch.float32)
+ else:
+ s_tok = torch.ones(bt.a.shape[0],
+ dtype=torch.float32,
+ device=device)
+
+ fn = lambda: ops.marlin_qqq_gemm(a=bt.a,
+ b_q_weight=w_q,
+ s_group=w_s,
+ s_tok=s_tok,
+ s_ch=s_ch,
+ workspace=workspace.scratch,
+ size_m=bt.a.shape[0],
+ size_n=bt.w_ref.shape[1],
+ size_k=bt.w_ref.shape[0])
+
+ return fn
+
+
+def machete_create_bench_fn(bt: BenchmarkTensors,
+ out_type=torch.dtype,
+ schedule=None) -> Callable:
+ w_q = bt.w_q.t().contiguous().t() # make col major
+ w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype,
+ None if bt.w_g_s is None else bt.w_g_s.dtype)
+
+ w_g_zp = bt.w_g_zp
+ if w_g_zp is not None:
+ w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype))
+
+ return lambda: ops.machete_mm(
+ a=bt.a,
+        b_q=w_q,  # the prepacked weights computed above
+ b_type=bt.wtype,
+ b_group_scales=bt.w_g_s,
+ b_group_zeros=w_g_zp,
+ b_group_size=bt.group_size,
+ b_channel_scales=bt.w_ch_s,
+ a_token_scales=bt.w_tok_s,
+ out_type=out_type,
+ schedule=schedule,
+ )
# impl
-
# bench
-def bench_fn(label: str, sub_label: str, description: str,
- fn: Callable) -> TMeasurement:
- min_run_time = 1
- return TBenchmark.Timer(
- stmt="fn()",
+
+def bench_fns(label: str, sub_label: str, description: str,
+ fns: List[Callable]):
+
+ min_run_time = 1 if not NVTX_PROFILE else 0.1
+ res = TBenchmark.Timer(
+ stmt="""
+ for fn in fns:
+ fn()
+ """,
globals={
- "fn": fn
+ "fns": fns
},
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
+ if NVTX_PROFILE:
+ with nvtx.annotate("mm-bench"), nvtx.annotate(
+ f"{label}|{sub_label}|{description}"):
+ fns[0]()
-def loop_over_weights(
- a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
- torch.tensor, torch.tensor]],
- fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
- None]):
- for w_ref, w_q, w_s, _ in weights:
- fn(a, w_ref, w_q, w_s)
+ return res
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
-def bench(atype: torch.dtype,
- wtype: ScalarType,
+def bench(types: TypeConfig,
group_size: int,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
- benchmark_marlinv1: bool = True,
- sweep_schedules: bool = True) -> Iterable[TMeasurement]:
- global _SWEEP_SCHEDULES_RESULTS
-
- a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
- sub_label += f", L={len(weights)}"
-
- weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
- for w_ref, w_q, w_s, w_zp in weights]
+ sweep_schedules: bool = True) -> List[TMeasurement]:
+ benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
+ sub_label += f", L={len(benchmark_tensors)}"
+
+    name_type_string = (f"W{types.weight_type}"
+                        f"-A{terse_type_name(types.act_type)}")
+ if types.group_scale_type is not None:
+ name_type_string += f"-GS{terse_type_name(types.group_scale_type)}"
+ if types.group_zero_type is not None:
+ name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}"
+ if group_size is not None:
+ name_type_string += f"-G{group_size}"
+ if types.channel_scale_type is not None:
+ name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}"
+ if types.token_scale_type is not None:
+ name_type_string += f"-TS{terse_type_name(types.token_scale_type)}"
timers = []
# pytorch impl
timers.append(
- bench_fn(
- label, sub_label, "torch.matmul", lambda: loop_over_weights(
- a,
- weights,
- lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
- )))
+ bench_fns(
+ label, sub_label, "torch.matmul (fp16)",
+ [torch_matmul_f16_create_bench_fn(bt)
+ for bt in benchmark_tensors]))
- if benchmark_marlinv1:
- w_ref = weights[0][0]
-
- w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
- sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
- g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
-
- def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
- w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
- return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
- wtype.size_bits)
-
- def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
- return marlin_permute_scales(w_s, *w_ref.shape, group_size)
-
- weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
- marlinv1_permute_scales(w_s), w_zp)
- for w_ref, w_q, w_s, w_zp in weights]
-
- workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
- GPTQ_MARLIN_MAX_PARALLEL)
-
- # marlinv1
+ if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn:
+ timers.append(
+ bench_fns(
+ label, sub_label,
+ f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [
+ cutlass_scaled_mm_create_bench_fn(bt)
+ for bt in benchmark_tensors
+ ]))
+
+ if types.act_type != torch.float8_e4m3fn:
timers.append(
- bench_fn(
- label, sub_label, "marlin_orig", lambda: loop_over_weights(
- a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
- gptq_marlin_gemm(a,
- w_q,
- w_s,
- w_zp_empty,
- g_idx,
- sort_indices,
- workspace.scratch,
- wtype,
- size_m=a.shape[0],
- size_n=w_ref.shape[1],
- size_k=w_ref.shape[0],
- is_k_full=True))))
+ bench_fns(label, sub_label, f"marlin ({name_type_string})",
+ [marlin_create_bench_fn(bt)
+ for bt in benchmark_tensors]))
# machete
timers.append(
- bench_fn(
- label, sub_label, "machete_heuristic", lambda: loop_over_weights(
- a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
- a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
+ bench_fns(label, sub_label, f"machete ({name_type_string})", [
+ machete_create_bench_fn(bt, out_type=types.output_type)
+ for bt in benchmark_tensors
+ ]))
if sweep_schedules:
+ global _SWEEP_SCHEDULES_RESULTS
+
print("Finding best schedule for machete")
best = None
best_schedule = None
- schedules = ops.machete_supported_schedules(wtype)
+ schedules = ops.machete_supported_schedules(
+ a_type=types.act_type,
+ b_type=types.weight_type,
+ group_scales_type=types.group_scale_type,
+ group_zeros_type=types.group_zero_type,
+ token_scales_type=types.token_scale_type,
+ channel_scales_type=types.channel_scale_type,
+ out_type=types.output_type)
+
+ if schedules is None or len(schedules) == 0:
+ raise ValueError("No schedules found to sweep")
+
for schedule in reversed(schedules):
schedule_M = int(schedule.split("_")[0].split("x")[1])
@@ -177,16 +381,11 @@ def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
continue
- def run(a, _, w_q, w_s, schedule=schedule):
- ops.machete_gemm(a,
- w_q,
- wtype,
- w_s,
- b_group_size=group_size,
- schedule=schedule)
-
- res = bench_fn(label, sub_label, "machete_best",
- lambda: loop_over_weights(a, weights_machete, run))
+ res = bench_fns(label, sub_label, "machete_best", [
+ machete_create_bench_fn(
+ bt, out_type=types.output_type, schedule=schedule)
+ for bt in benchmark_tensors
+ ])
results_row = {
"M": m,
@@ -213,25 +412,33 @@ def run(a, _, w_q, w_s, schedule=schedule):
# runner
-def print_timers(timers: Iterable[TMeasurement]):
+def print_timers(timers: List[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
-def run(dtype: torch.dtype, sweep_schedules: bool,
- MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+ types = TypeConfig(
+ act_type=args.act_type,
+ weight_type=scalar_types.uint4b8 if args.group_zero_type is None \
+ else scalar_types.uint4,
+ output_type=args.out_type,
+ group_scale_type=args.group_scale_type,
+ group_zero_type=args.group_zero_type,
+ channel_scale_type=args.channel_scale_type,
+ token_scale_type=args.token_scale_type,
+ )
- results = []
+ results: List[TMeasurement] = []
for m, k, n in MKNs:
- timers = bench(dtype,
- scalar_types.uint4b8,
- 128,
+ timers = bench(types,
+ args.group_size,
m,
k,
n,
- f"{dtype}-gemm",
+ f"{args.act_type}-gemm",
f"MKN=({m}x{k}x{n})",
- sweep_schedules=sweep_schedules)
+ sweep_schedules=args.sweep_schedules)
print_timers(timers)
results.extend(timers)
@@ -240,7 +447,7 @@ def run(dtype: torch.dtype, sweep_schedules: bool,
# output makers
def make_output(
- data: Iterable[TMeasurement],
+ data: List[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None,
@@ -262,7 +469,6 @@ def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-
data = run(args.dtype, args.sweep_schedules, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
@@ -306,33 +512,49 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
for k, n in KNs:
MKNs.append((m, k, n))
- data = run(args.dtype, args.sweep_schedules, MKNs)
+ data = run(args, MKNs)
model_bench_data.append(data)
+ type_string = f"{args.act_type}"
+
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
- print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+ print(f"== Results {type_string} {model}-TP{tp_size} ====")
print_timers(data)
- timestamp = int(time.time())
+ timestr = time.strftime("%Y%m%d-%H%M%S")
- all_data = []
+ all_results = []
for d in model_bench_data:
- all_data.extend(d)
+ all_results.extend(d)
+
# pickle all data
- with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
- pkl.dump(all_data, f)
+ with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f:
+ args_dict = vars(args)
+ args_dict.pop("func")
+ pkl.dump({
+ "args": args_dict,
+ "results": all_results,
+ }, f)
if __name__ == "__main__":
def to_torch_dtype(dt):
- if dt == "bfloat16":
- return torch.bfloat16
- if dt == "float16":
- return torch.float16
- raise ValueError("unsupported dtype")
+ return {
+ "bfloat16": torch.bfloat16,
+ "float16": torch.float16,
+ "int8": torch.int8,
+ "float8_e4m3fn": torch.float8_e4m3fn,
+ "int": torch.int,
+ "float": torch.float,
+ }[dt]
+
+ class ToTorchDtype(argparse.Action):
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ setattr(namespace, self.dest, to_torch_dtype(values))
parser = FlexibleArgumentParser(
description="""
@@ -352,12 +574,42 @@ def to_torch_dtype(dt):
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter,
)
-
parser.add_argument(
- "--dtype",
- type=to_torch_dtype,
+ "--act-type",
+ action=ToTorchDtype,
required=True,
- help="Available options are ['bfloat16', 'float16']",
+ choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'],
+ )
+ parser.add_argument(
+ "--group-scale-type",
+ action=ToTorchDtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--group-zero-type",
+ type=to_torch_dtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--channel-scale-type",
+ action=ToTorchDtype,
+ choices=['float'],
+ )
+ parser.add_argument(
+ "--token-scale-type",
+ action=ToTorchDtype,
+ choices=['float'],
+ )
+ parser.add_argument(
+ "--out-type",
+ action=ToTorchDtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--group-size",
+ type=int,
+ help="Available options are ['None', '-1', '128'], default=128",
+ default=128,
)
parser.add_argument(
"--sweep-schedules",
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 536c133bb3341..8fb44e3a3dbd8 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -131,7 +131,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
- "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -141,7 +141,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
- "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
new file mode 100644
index 0000000000000..baa5de0fff1bd
--- /dev/null
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -0,0 +1,262 @@
+import itertools
+from typing import Optional, Tuple, Union
+
+import torch
+import triton
+from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from torch import nn
+
+from vllm import _custom_ops as vllm_ops
+
+
+class HuggingFaceRMSNorm(nn.Module):
+
+ def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+ orig_dtype = x.dtype
+ x = x.to(torch.float32)
+ if residual is not None:
+ x = x + residual.to(torch.float32)
+ residual = x.to(orig_dtype)
+
+ variance = x.pow(2).mean(dim=-1, keepdim=True)
+ x = x * torch.rsqrt(variance + self.variance_epsilon)
+ x = x.to(orig_dtype) * self.weight
+ if residual is None:
+ return x
+ else:
+ return x, residual
+
+
+def rmsnorm_naive(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ eps: float = 1e-6,
+):
+ naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
+ naive_norm.weight = nn.Parameter(weight)
+ naive_norm = naive_norm.to(x.device)
+
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+ if residual is not None:
+ residual = residual.view(-1, residual.shape[-1])
+
+ output = naive_norm(x, residual)
+
+ if isinstance(output, tuple):
+ output = (output[0].view(orig_shape), output[1].view(orig_shape))
+ else:
+ output = output.view(orig_shape)
+ return output
+
+
+def rmsnorm_flashinfer(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+ if residual is not None:
+ residual = residual.view(-1, residual.shape[-1])
+
+ if residual is not None:
+ fused_add_rmsnorm(x, residual, weight, eps)
+ output = (x, residual)
+ else:
+ output = rmsnorm(x, weight, eps)
+
+ if isinstance(output, tuple):
+ output = (output[0].view(orig_shape), output[1].view(orig_shape))
+ else:
+ output = output.view(orig_shape)
+ return output
+
+
+def rmsnorm_vllm(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+ if residual is not None:
+ residual = residual.view(-1, residual.shape[-1])
+
+ if residual is not None:
+ vllm_ops.fused_add_rms_norm(x, residual, weight, eps)
+ output = (x, residual)
+ else:
+ out = torch.empty_like(x)
+ vllm_ops.rms_norm(out, x, weight, eps)
+ output = out
+
+ if isinstance(output, tuple):
+ output = (output[0].view(orig_shape), output[1].view(orig_shape))
+ else:
+ output = output.view(orig_shape)
+ return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
+ dtype = torch.bfloat16
+ x = torch.randn(batch_size,
+ seq_len,
+ hidden_size,
+ dtype=dtype,
+ device="cuda")
+ weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+ residual = torch.randn_like(x) if use_residual else None
+
+ output_naive = rmsnorm_naive(
+ x.clone(), weight,
+ residual.clone() if residual is not None else None)
+ output_flashinfer = rmsnorm_flashinfer(
+ x.clone(), weight,
+ residual.clone() if residual is not None else None)
+ output_vllm = rmsnorm_vllm(
+ x.clone(), weight,
+ residual.clone() if residual is not None else None)
+
+ if use_residual:
+ output_naive = output_naive[0]
+ output_flashinfer = output_flashinfer[0]
+ output_vllm = output_vllm[0]
+
+ print(f"Naive output={output_naive}")
+ print(f"FlashInfer output={output_flashinfer}")
+ print(f"VLLM output={output_vllm}")
+
+ if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
+ rtol=1e-2) and torch.allclose(
+ output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+ print("✅ All implementations match")
+ else:
+ print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+head_num_range = [32, 48]
+configs = list(
+ itertools.product(head_num_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark(use_residual):
+
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["head_num", "batch_size", "seq_len"],
+ x_vals=[list(_) for _ in configs],
+ line_arg="provider",
+ line_vals=["huggingface", "flashinfer", "vllm"],
+ line_names=["HuggingFace", "FlashInfer", "vLLM"],
+ styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name=
+ f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual",
+ args={},
+ ))
+ def benchmark(head_num, batch_size, seq_len, provider):
+ dtype = torch.bfloat16
+ hidden_size = head_num * 128 # assuming head_dim = 128
+
+ x = torch.randn(batch_size,
+ seq_len,
+ hidden_size,
+ dtype=dtype,
+ device="cuda")
+ weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+ residual = torch.randn_like(x) if use_residual else None
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "huggingface":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rmsnorm_naive(
+ x.clone(),
+ weight,
+ residual.clone() if residual is not None else None,
+ ),
+ quantiles=quantiles,
+ )
+ elif provider == "flashinfer":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rmsnorm_flashinfer(
+ x.clone(),
+ weight,
+ residual.clone() if residual is not None else None,
+ ),
+ quantiles=quantiles,
+ )
+ else:
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rmsnorm_vllm(
+ x.clone(),
+ weight,
+ residual.clone() if residual is not None else None,
+ ),
+ quantiles=quantiles,
+ )
+
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+ return benchmark
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=4,
+ help="Batch size",
+ )
+ parser.add_argument(
+ "--seq-len",
+ type=int,
+ default=128,
+ help="Sequence length",
+ )
+ parser.add_argument(
+ "--hidden-size",
+ type=int,
+ default=4096,
+ help="Hidden size (2nd dimension) of the sequence",
+ )
+ parser.add_argument("--use-residual",
+ action="store_true",
+ help="Whether to use residual connection")
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/rmsnorm/",
+ help="Path to save rmsnorm benchmark results",
+ )
+
+ args = parser.parse_args()
+
+ # Run correctness test
+ calculate_diff(batch_size=args.batch_size,
+ seq_len=args.seq_len,
+ hidden_size=args.hidden_size,
+ use_residual=args.use_residual)
+
+ # Get the benchmark function with proper use_residual setting
+ benchmark = get_benchmark(args.use_residual)
+ # Run performance benchmark
+ benchmark.run(print_data=True, save_path=args.save_path)
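
All three implementations benchmarked above compute the same fused add + RMSNorm; as a compact reference of that math (a hedged sketch mirroring rmsnorm_naive, not an additional benchmark path):

import torch

def fused_add_rmsnorm_reference(x, residual, weight, eps=1e-6):
    # The residual add happens first; the summed value is both normalized and
    # handed back as the new residual, which is why the functions above return
    # (output, residual) tuples when a residual is given.
    hidden = x.float() + residual.float()
    new_residual = hidden.to(x.dtype)
    variance = hidden.pow(2).mean(dim=-1, keepdim=True)
    out = (hidden * torch.rsqrt(variance + eps)).to(x.dtype) * weight
    return out, new_residual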
diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py
index de608fd05af70..7d0bd84150a27 100644
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -20,10 +20,11 @@
args = parser.parse_args()
with open(args.filename, 'rb') as f:
- data: List[TMeasurement] = pickle.load(f)
+ data = pickle.load(f)
+ raw_results: List[TMeasurement] = data["results"]
results = defaultdict(lambda: list())
- for v in data:
+ for v in raw_results:
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
if result is not None:
KN = result.group(1)
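
Because the machete benchmark now pickles a dict rather than a bare list of measurements, consumers unpack it the way the updated plotting script does. A small sketch of the expected round trip (the file name and args shown are illustrative):

import pickle

# Writing side (benchmark_machete.py): results are stored together with the
# CLI arguments that produced them.
payload = {
    "args": {"act_type": "float8_e4m3fn", "group_size": 128},
    "results": [],  # list of torch.utils.benchmark Measurement objects
}
with open("model_bench-example.pkl", "wb") as f:
    pickle.dump(payload, f)

# Reading side (graph_machete_bench.py): index into "results" instead of
# treating the whole pickle as the list itself.
with open("model_bench-example.pkl", "rb") as f:
    data = pickle.load(f)
raw_results = data["results"]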
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index 25ec9d6028627..51f24f3ba1774 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -40,4 +40,10 @@
([8192, 57344], 1),
([28672, 8192], 0),
],
+ "meta-llama/Llama-3.1-405b-hf": [
+ ([16384, 18432], 1),
+ ([16384, 16384], 0),
+ ([16384, 106496], 1),
+ ([53248, 16384], 0),
+ ],
}
diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json
new file mode 100644
index 0000000000000..6003698469e8d
--- /dev/null
+++ b/benchmarks/structured_schemas/structured_schema_1.json
@@ -0,0 +1,113 @@
+{
+ "$schema":
+ "https://json-schema.org/draft/2020-12/schema",
+ "title":
+ "User Profile",
+ "type":
+ "object",
+ "properties": {
+ "userId": {
+ "type": "string",
+ "description": "Unique identifier for the user."
+ },
+ "personalInfo": {
+ "type": "object",
+ "properties": {
+ "firstName": {
+ "type": "string",
+ "description": "The user's first name."
+ },
+ "lastName": {
+ "type": "string",
+ "description": "The user's last name."
+ },
+ "age": {
+ "type": "integer",
+ "minimum": 0,
+ "description": "The user's age."
+ },
+ "phoneNumbers": {
+ "type":
+ "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["home", "work", "mobile"],
+ "description": "Type of phone number."
+ },
+ "number": {
+ "type": "string",
+ "pattern": "^\\+?[1-9]\\d{1,14}$",
+ "description": "Phone number in E.164 format."
+ }
+ },
+ "required": ["type", "number"]
+ },
+ "description":
+ "List of phone numbers associated with the user."
+ }
+ },
+ "required": ["firstName", "lastName"]
+ },
+ "address": {
+ "type": "object",
+ "properties": {
+ "street": {
+ "type": "string",
+ "description": "Street address."
+ },
+ "city": {
+ "type": "string",
+ "description": "City name."
+ },
+ "state": {
+ "type": "string",
+ "description": "State or province."
+ },
+ "postalCode": {
+ "type": "string",
+ "pattern": "^\\d{5}(-\\d{4})?$",
+ "description": "Postal code."
+ },
+ "country": {
+ "type": "string",
+ "description": "Country name."
+ }
+ },
+ "required": ["street", "city", "state", "postalCode", "country"]
+ },
+ "preferences": {
+ "type": "object",
+ "properties": {
+ "newsletterSubscribed": {
+ "type":
+ "boolean",
+ "description":
+ "Indicates if the user is subscribed to the newsletter."
+ },
+ "favoriteCategories": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "List of user's favorite categories."
+ }
+ },
+ "required": ["newsletterSubscribed"]
+ },
+ "accountStatus": {
+ "type": "string",
+ "enum": ["active", "inactive", "suspended"],
+ "description": "Current status of the user's account."
+ },
+ "registrationDate": {
+ "type": "string",
+ "format": "date-time",
+ "description": "ISO 8601 formatted date-time of user registration."
+ }
+ },
+ "required":
+ ["userId", "personalInfo", "address", "accountStatus", "registrationDate"]
+}
\ No newline at end of file
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 5912c5c02ede7..68f7ca1af05ad 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -16,9 +16,14 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
#
# Check the compile flags
#
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+ list(APPEND CXX_COMPILE_FLAGS
+ "-mf16c"
+ )
+endif()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
- "-mf16c"
"-DVLLM_CPU_EXTENSION")
execute_process(COMMAND cat /proc/cpuinfo
@@ -53,6 +58,8 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
+find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
+find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
if (AVX512_FOUND AND NOT AVX512_DISABLED)
list(APPEND CXX_COMPILE_FLAGS
@@ -72,9 +79,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
else()
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
endif()
+
elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
+
elseif (POWER9_FOUND OR POWER10_FOUND)
message(STATUS "PowerPC detected")
# Check for PowerPC VSX support
@@ -82,8 +91,20 @@ elseif (POWER9_FOUND OR POWER10_FOUND)
"-mvsx"
"-mcpu=native"
"-mtune=native")
+
+elseif (ASIMD_FOUND)
+ message(STATUS "ARMv8 or later architecture detected")
+ if(ARM_BF16_FOUND)
+ message(STATUS "BF16 extension detected")
+ set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
+ add_compile_definitions(ARM_BF16_SUPPORT)
+ else()
+ message(WARNING "BF16 functionality is not available")
+ set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
+ endif()
+ list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
else()
- message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
+ message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
endif()
#
@@ -153,4 +174,4 @@ define_gpu_extension_target(
WITH_SOABI
)
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
\ No newline at end of file
diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu
index 741cd0c82dc89..cb1a069942069 100644
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@@ -140,13 +140,10 @@ void paged_attention_v1_launcher(
blocksparse_block_size, blocksparse_head_sliding_step);
#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
+ if (is_block_sparse) { \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ } else { \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu
index 6de8d0bdd5b8d..c457bdb89008e 100644
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@@ -147,13 +147,10 @@ void paged_attention_v2_launcher(
blocksparse_head_sliding_step);
#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
+ if (is_block_sparse) { \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ } else { \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 1be806bbfa43c..8a95279f9a25a 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -307,10 +307,20 @@ void reshape_and_cache_flash(
torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor&
value_cache, // [num_blocks, block_size, num_heads, head_size]
- torch::Tensor& slot_mapping, // [num_tokens]
+ torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, const double k_scale,
const double v_scale) {
- int num_tokens = key.size(0);
+ // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
+ // slot_mapping.size(0) because of padding for CUDA graphs.
+ // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+ // both include padding.
+ // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+ // since key includes padding for CUDA graphs, while slot_mapping does not.
+ // In this case, slot_mapping.size(0) represents the actual number of tokens
+ // before padding.
+ // For compatibility with both cases, we use slot_mapping.size(0) as the
+ // number of tokens.
+ int num_tokens = slot_mapping.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
int block_size = key_cache.size(1);
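
A small illustration of the situation the note above describes; the shapes are made up and only the relationship between them matters:

import torch

# vLLM V1: `key` may be padded up to the CUDA-graph batch size, while
# `slot_mapping` only covers the real tokens, so slot_mapping.size(0) is the
# safe token count for the cache-write loop.
num_actual_tokens, padded_num_tokens = 13, 16
key = torch.zeros(padded_num_tokens, 8, 128)    # includes CUDA-graph padding
slot_mapping = torch.arange(num_actual_tokens)  # no padding
num_tokens = slot_mapping.shape[0]              # 13, not key.shape[0] == 16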
diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp
new file mode 100644
index 0000000000000..ba9f40a230c8e
--- /dev/null
+++ b/csrc/core/math.hpp
@@ -0,0 +1,7 @@
+#include <climits>
+#include <cstdint>
+
+inline uint32_t next_pow_2(uint32_t const num) {
+ if (num <= 1) return num;
+ return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
\ No newline at end of file
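
For orientation, the helper above rounds up to the next power of two with a count-leading-zeros trick; a Python sketch of the same computation:

def next_pow_2(num: int) -> int:
    """Python equivalent of the CHAR_BIT * sizeof - __builtin_clz formula."""
    if num <= 1:
        return num
    return 1 << (num - 1).bit_length()

assert [next_pow_2(n) for n in (0, 1, 2, 3, 5, 8, 9)] == [0, 1, 2, 4, 8, 8, 16]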
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index e73eca1b345fd..e21832ba7582f 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -24,12 +24,20 @@ struct KernelVecType {
template <>
struct KernelVecType<c10::Half> {
+#ifdef __powerpc64__
+ // Power architecture-specific vector types
+ using q_load_vec_type = vec_op::FP32Vec8;
+ using k_load_vec_type = vec_op::FP32Vec16;
+ using v_load_vec_type = vec_op::FP32Vec16;
+#else
+ // Fallback for other architectures, including x86
using q_load_vec_type = vec_op::FP16Vec8;
- using q_vec_type = vec_op::FP32Vec16;
using k_load_vec_type = vec_op::FP16Vec16;
+ using v_load_vec_type = vec_op::FP16Vec16;
+#endif
+ using q_vec_type = vec_op::FP32Vec16;
using k_vec_type = vec_op::FP32Vec16;
using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::FP16Vec16;
};
#ifdef __AVX512BF16__
@@ -43,6 +51,21 @@ struct KernelVecType {
using v_load_vec_type = vec_op::BF16Vec16;
};
#else
+ #ifdef __aarch64__
+ #ifndef ARM_BF16_SUPPORT
+ // pass
+ #else
+template <>
+struct KernelVecType<c10::BFloat16> {
+ using q_load_vec_type = vec_op::BF16Vec8;
+ using q_vec_type = vec_op::FP32Vec16;
+ using k_load_vec_type = vec_op::BF16Vec16;
+ using k_vec_type = vec_op::FP32Vec16;
+ using qk_acc_vec_type = vec_op::FP32Vec16;
+ using v_load_vec_type = vec_op::BF16Vec16;
+};
+ #endif
+ #else
template <>
struct KernelVecType<c10::BFloat16> {
using q_load_vec_type = vec_op::BF16Vec8;
@@ -52,6 +75,7 @@ struct KernelVecType {
using qk_acc_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
+ #endif
#endif
template
@@ -771,4 +795,4 @@ void paged_attention_v2(
CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
});
-}
+}
\ No newline at end of file
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index 0213be09105ed..28db0479748bf 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -1,4 +1,3 @@
-
#ifndef CPU_TYPES_HPP
#define CPU_TYPES_HPP
@@ -8,8 +7,11 @@
#elif defined(__POWER9_VECTOR__)
//ppc implementation
#include "cpu_types_vsx.hpp"
+#elif defined(__aarch64__)
+ //arm implementation
+ #include "cpu_types_arm.hpp"
#else
#warning "unsupported vLLM cpu implementation"
#endif
-#endif
+#endif
\ No newline at end of file
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
new file mode 100644
index 0000000000000..73e0f8cb2e0fb
--- /dev/null
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -0,0 +1,515 @@
+#include <arm_neon.h>
+#include <torch/all.h>
+#include <cmath>
+
+namespace vec_op {
+
+#ifdef ARM_BF16_SUPPORT
+ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+#else
+ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
+#endif
+
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+#ifndef CPU_OP_GUARD
+#define CPU_KERNEL_GUARD_IN(NAME)
+#define CPU_KERNEL_GUARD_OUT(NAME)
+#else
+#define CPU_KERNEL_GUARD_IN(NAME) \
+ std::cout << #NAME << " invoked." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+#endif
+
+#define FORCE_INLINE __attribute__((always_inline)) inline
+
+namespace {
+  template <typename T, T... indexes, typename F>
+  constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
+    (f(std::integral_constant<T, indexes>{}), ...);
+  };
+};
+
+template <typename T, T count, typename F,
+          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
+constexpr void unroll_loop(F &&f) {
+  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
+}
+
+template <typename T> struct Vec {
+ constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
+};
+
+struct FP32Vec8;
+struct FP32Vec16;
+
+struct FP16Vec8 : public Vec<FP16Vec8> {
+ constexpr static int VEC_ELEM_NUM = 8;
+
+ float16x8_t reg;
+
+ explicit FP16Vec8(const void *ptr)
+    : reg(vld1q_f16(static_cast<const __fp16 *>(ptr))) {};
+
+ explicit FP16Vec8(const FP32Vec8 &);
+
+ void save(void *ptr) const {
+ vst1q_f16(static_cast<__fp16 *>(ptr), reg);
+ }
+};
+
+struct FP16Vec16 : public Vec<FP16Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
+
+ float16x8x2_t reg;
+
+ explicit FP16Vec16(const void *ptr) {
+    reg.val[0] = vld1q_f16(reinterpret_cast<const __fp16 *>(ptr));
+    reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16 *>(ptr) + 8);
+ }
+
+ explicit FP16Vec16(const FP32Vec16& vec);
+
+ void save(void *ptr) const {
+ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
+ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
+ }
+
+ void save(void *ptr, const int elem_num) const {
+ int full_blocks = elem_num / 8;
+ int remainder = elem_num % 8;
+
+ if (full_blocks > 0) {
+ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
+ if (full_blocks > 1) {
+ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
+ }
+ }
+
+ if (remainder > 0) {
+ float16x8_t temp = reg.val[full_blocks];
+ for (int i = 0; i < remainder; ++i) {
+ reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i);
+ }
+ }
+ }
+};
+
+
+#ifdef ARM_BF16_SUPPORT
+struct BF16Vec8 : public Vec<BF16Vec8> {
+ constexpr static int VEC_ELEM_NUM = 8;
+
+ bfloat16x8_t reg;
+
+ explicit BF16Vec8(const void *ptr)
+    : reg(*reinterpret_cast<const bfloat16x8_t *>(ptr)) {};
+
+ explicit BF16Vec8(bfloat16x8_t data) : reg(data) {};
+
+ explicit BF16Vec8(const FP32Vec8 &);
+
+ explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {};
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8_t *>(ptr) = reg; }
+};
+
+struct BF16Vec16 : public Vec<BF16Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
+
+ bfloat16x8x2_t reg;
+
+ explicit BF16Vec16(const void *ptr)
+    : reg(*reinterpret_cast<const bfloat16x8x2_t *>(ptr)) {};
+
+ explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
+
+ explicit BF16Vec16(const FP32Vec16 &);
+
+ explicit BF16Vec16(float32x4x4_t v) : reg({
+ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]),
+ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])
+ }){};
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8x2_t *>(ptr) = reg; };
+};
+
+struct BF16Vec32 : public Vec<BF16Vec32> {
+ constexpr static int VEC_ELEM_NUM = 32;
+
+ bfloat16x8x4_t reg;
+
+ explicit BF16Vec32(const void *ptr)
+    : reg(*reinterpret_cast<const bfloat16x8x4_t *>(ptr)) {};
+
+ explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {};
+
+ explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({
+ vec8_data.reg,
+ vec8_data.reg,
+ vec8_data.reg,
+ vec8_data.reg
+ }) {};
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8x4_t *>(ptr) = reg; };
+};
+#endif
+
+struct FP32Vec4 : public Vec<FP32Vec4> {
+ constexpr static int VEC_ELEM_NUM = 4;
+
+ union AliasReg {
+ float32x4_t reg;
+ float values[VEC_ELEM_NUM];
+ };
+
+ float32x4_t reg;
+
+ explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {};
+
+ explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {};
+
+ explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {};
+
+ explicit FP32Vec4(float32x4_t data) : reg(data) {};
+
+ explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {};
+};
+
+struct FP32Vec8 : public Vec<FP32Vec8> {
+ constexpr static int VEC_ELEM_NUM = 8;
+ union AliasReg {
+ float32x4x2_t reg;
+ float values[VEC_ELEM_NUM];
+ };
+
+ float32x4x2_t reg;
+
+ explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {};
+
+ explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {};
+
+ explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {};
+
+ explicit FP32Vec8(float32x4x2_t data) : reg(data) {};
+
+ explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {};
+
+ explicit FP32Vec8(const FP16Vec8 &v) {
+ reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg));
+ reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg));
+ };
+
+ explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {};
+
+ #ifdef ARM_BF16_SUPPORT
+
+ explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {};
+
+ explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {};
+
+ #endif
+
+ float reduce_sum() const {
+ AliasReg ar;
+ ar.reg = reg;
+ float answer = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; });
+
+ return answer;
+ }
+
+ FP32Vec8 exp() const {
+ AliasReg ar;
+ ar.reg = reg;
+
+ float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])};
+ float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])};
+ float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])};
+ float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])};
+
+ float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1);
+ float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3);
+
+ float32x4x2_t result;
+ result.val[0] = result0;
+ result.val[1] = result1;
+
+ return FP32Vec8(result);
+ }
+
+ FP32Vec8 tanh() const {
+ AliasReg ar;
+ ar.reg = reg;
+
+ float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])};
+ float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])};
+ float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])};
+ float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])};
+
+ float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1);
+ float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3);
+
+ float32x4x2_t result;
+ result.val[0] = result0;
+ result.val[1] = result1;
+
+ return FP32Vec8(result);
+ }
+
+ FP32Vec8 er() const {
+ AliasReg ar;
+ ar.reg = reg;
+
+    float32x2_t er_vec0 = {static_cast<float32_t>(erf(ar.values[0])), static_cast<float32_t>(erf(ar.values[1]))};
+    float32x2_t er_vec1 = {static_cast<float32_t>(erf(ar.values[2])), static_cast<float32_t>(erf(ar.values[3]))};
+    float32x2_t er_vec2 = {static_cast<float32_t>(erf(ar.values[4])), static_cast<float32_t>(erf(ar.values[5]))};
+    float32x2_t er_vec3 = {static_cast<float32_t>(erf(ar.values[6])), static_cast<float32_t>(erf(ar.values[7]))};
+
+ float32x4_t result0 = vcombine_f32(er_vec0, er_vec1);
+ float32x4_t result1 = vcombine_f32(er_vec2, er_vec3);
+
+ float32x4x2_t result;
+ result.val[0] = result0;
+ result.val[1] = result1;
+
+ return FP32Vec8(result);
+ }
+
+ FP32Vec8 operator*(const FP32Vec8 &b) const {
+ return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])}));
+ }
+
+ FP32Vec8 operator+(const FP32Vec8 &b) const {
+ return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])}));
+ }
+
+ FP32Vec8 operator-(const FP32Vec8 &b) const {
+ return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])}));
+ }
+
+ FP32Vec8 operator/(const FP32Vec8 &b) const {
+ return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])}));
+ }
+
+ void save(float *ptr) const {
+ vst1q_f32(ptr, reg.val[0]);
+ vst1q_f32(ptr + 4, reg.val[1]);
+ }
+};
+
+struct FP32Vec16 : public Vec<FP32Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
+ union AliasReg {
+ float32x4x4_t reg;
+ float values[VEC_ELEM_NUM];
+ };
+
+ float32x4x4_t reg;
+
+ explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {}
+
+ explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}
+
+ explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {}
+
+ explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
+
+ explicit FP32Vec16(const FP32Vec8 &data) {
+ reg.val[0] = data.reg.val[0];
+ reg.val[1] = data.reg.val[1];
+ reg.val[2] = data.reg.val[0];
+ reg.val[3] = data.reg.val[1];
+ }
+
+ explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
+
+ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {}
+
+ #ifdef ARM_BF16_SUPPORT
+ explicit FP32Vec16(bfloat16x8x2_t v) : reg({
+ vcvtq_low_f32_bf16(v.val[0]),
+ vcvtq_high_f32_bf16(v.val[0]),
+ vcvtq_low_f32_bf16(v.val[1]),
+ vcvtq_high_f32_bf16(v.val[1])
+ }) {};
+ #endif
+
+ explicit FP32Vec16(const FP32Vec4 &data) {
+ reg.val[0] = data.reg;
+ reg.val[1] = data.reg;
+ reg.val[2] = data.reg;
+ reg.val[3] = data.reg;
+ };
+
+ #ifdef ARM_BF16_SUPPORT
+ explicit FP32Vec16(const BF16Vec16 &v) : reg({
+ vcvtq_low_f32_bf16(v.reg.val[0]),
+ vcvtq_high_f32_bf16(v.reg.val[0]),
+ vcvtq_low_f32_bf16(v.reg.val[1]),
+ vcvtq_high_f32_bf16(v.reg.val[1])
+ }) {};
+
+ explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {};
+ #endif
+
+ explicit FP32Vec16(const FP16Vec16 &v) {
+ reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0]));
+ reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0]));
+ reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1]));
+ reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
+ };
+
+ FP32Vec16 operator+(const FP32Vec16 &b) const {
+ return FP32Vec16(float32x4x4_t({
+ vaddq_f32(reg.val[0], b.reg.val[0]),
+ vaddq_f32(reg.val[1], b.reg.val[1]),
+ vaddq_f32(reg.val[2], b.reg.val[2]),
+ vaddq_f32(reg.val[3], b.reg.val[3])}));
+ };
+
+ FP32Vec16 operator*(const FP32Vec16 &b) const {
+ return FP32Vec16(float32x4x4_t({
+ vmulq_f32(reg.val[0], b.reg.val[0]),
+ vmulq_f32(reg.val[1], b.reg.val[1]),
+ vmulq_f32(reg.val[2], b.reg.val[2]),
+ vmulq_f32(reg.val[3], b.reg.val[3])}));
+ };
+
+ FP32Vec16 operator-(const FP32Vec16 &b) const {
+ return FP32Vec16(float32x4x4_t({
+ vsubq_f32(reg.val[0], b.reg.val[0]),
+ vsubq_f32(reg.val[1], b.reg.val[1]),
+ vsubq_f32(reg.val[2], b.reg.val[2]),
+ vsubq_f32(reg.val[3], b.reg.val[3])
+ }));
+ };
+
+ FP32Vec16 operator/(const FP32Vec16 &b) const {
+ return FP32Vec16(float32x4x4_t({
+ vdivq_f32(reg.val[0], b.reg.val[0]),
+ vdivq_f32(reg.val[1], b.reg.val[1]),
+ vdivq_f32(reg.val[2], b.reg.val[2]),
+ vdivq_f32(reg.val[3], b.reg.val[3])
+ }));
+ };
+
+ float reduce_sum() const {
+ AliasReg ar;
+ ar.reg = reg;
+ float answer = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; });
+
+ return answer;
+ };
+
+  template <int group_size> float reduce_sub_sum(int idx) {
+ static_assert(VEC_ELEM_NUM % group_size == 0);
+
+ AliasReg ar;
+ ar.reg = reg;
+ float answer = 0;
+ const int start = idx * group_size;
+    unroll_loop<int, group_size>(
+ [&answer, &start, ar](int i) { answer += ar.values[start + i]; });
+
+ return answer;
+ };
+
+ void save(float *ptr) const {
+ vst1q_f32(ptr, reg.val[0]);
+ vst1q_f32(ptr + 4, reg.val[1]);
+ vst1q_f32(ptr + 8, reg.val[2]);
+ vst1q_f32(ptr + 12, reg.val[3]);
+ };
+};
+
+template <typename T> struct VecType { using vec_type = void; };
+
+template <typename T> using vec_t = typename VecType<T>::vec_type;
+
+template <> struct VecType<float> { using vec_type = FP32Vec8; };
+
+template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
+
+#ifdef ARM_BF16_SUPPORT
+template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+#endif
+
+template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+
+template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
+ *reinterpret_cast<__fp16 *>(ptr) = v;
+}
+
+inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) {
+ float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]);
+ float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]);
+ float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]);
+ float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]);
+
+ reg.val[0] = vcombine_f16(low_0, high_0);
+ reg.val[1] = vcombine_f16(low_1, high_1);
+};
+
+inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) {
+ float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]);
+ float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]);
+
+ reg = vcombine_f16(lower_half, upper_half);
+};
+
+inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+
+ acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]);
+ acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]);
+ acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]);
+ acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]);
+};
+
+#ifdef ARM_BF16_SUPPORT
+inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
+
+ float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0]));
+ float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0]));
+ float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1]));
+ float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1]));
+
+ float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0]));
+ float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0]));
+ float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1]));
+ float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1]));
+
+ acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low);
+ acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high);
+ acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low);
+ acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high);
+};
+#endif
+
+#ifdef ARM_BF16_SUPPORT
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {};
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({
+ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]),
+ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3])
+ }){};
+#endif
+
+inline void prefetch(const void *addr) {
+ __builtin_prefetch(addr, 0, 1);
+};
+
+#ifdef ARM_BF16_SUPPORT
+template <>
+inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+ *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v);
+};
+#endif
+};
\ No newline at end of file
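
The ARM_BF16_SUPPORT fma above widens bfloat16 operands to fp32 and accumulates in fp32; a rough per-lane reference of that behaviour (purely illustrative, not part of the header):

import torch

def bf16_widening_fma(acc: torch.Tensor, a: torch.Tensor, b: torch.Tensor):
    # acc += float32(a) * float32(b), matching the vcvt_f32_bf16 + vfmaq_f32
    # sequence in the ARM_BF16_SUPPORT branch above.
    return acc + a.to(torch.float32) * b.to(torch.float32)

acc = torch.zeros(32)                      # fp32 accumulator (BF16Vec32 width)
a = torch.randn(32, dtype=torch.bfloat16)
b = torch.randn(32, dtype=torch.bfloat16)
acc = bf16_widening_fma(acc, a, b)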
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
index f42fa2361a2db..d9aed657a3113 100644
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@@ -25,7 +25,13 @@ struct KernelVecType {
template <>
struct KernelVecType<c10::Half> {
+#ifdef __powerpc64__
+ // Power architecture-specific vector type
+ using load_vec_type = vec_op::FP32Vec16;
+#else
+ // Fallback for other architectures
using load_vec_type = vec_op::FP16Vec16;
+#endif
using azp_adj_load_vec_type = vec_op::INT32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
diff --git a/csrc/cutlass_extensions/common.cpp b/csrc/cutlass_extensions/common.cpp
new file mode 100644
index 0000000000000..3d2093ab94297
--- /dev/null
+++ b/csrc/cutlass_extensions/common.cpp
@@ -0,0 +1,11 @@
+#include "cutlass_extensions/common.hpp"
+
+int32_t get_sm_version_num() {
+ int32_t major_capability, minor_capability;
+ cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
+ 0);
+ cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
+ 0);
+ int32_t version_num = major_capability * 10 + minor_capability;
+ return version_num;
+}
\ No newline at end of file
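
For reference, the same SM version number can be read from PyTorch; a hedged equivalent of get_sm_version_num() for quick checks:

import torch

# major * 10 + minor, e.g. 90 for sm90 (Hopper); mirrors get_sm_version_num().
major, minor = torch.cuda.get_device_capability(0)
sm_version_num = major * 10 + minor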
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
new file mode 100644
index 0000000000000..85e359aa57113
--- /dev/null
+++ b/csrc/cutlass_extensions/common.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include
+#include "cuda_runtime.h"
+#include
+
+/**
+ * Helper function for checking CUTLASS errors
+ */
+#define CUTLASS_CHECK(status) \
+ { \
+ cutlass::Status error = status; \
+ TORCH_CHECK(error == cutlass::Status::kSuccess, \
+ cutlassGetStatusString(error)); \
+ }
+
+/**
+ * Panic wrapper for unwinding CUDA runtime errors
+ */
+#define CUDA_CHECK(status) \
+ { \
+ cudaError_t error = status; \
+ TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
+ }
+
+inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
+ int max_shared_mem_per_block_opt_in = 0;
+ cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
+ cudaDevAttrMaxSharedMemoryPerBlockOptin,
+ device);
+ return max_shared_mem_per_block_opt_in;
+}
+
+int32_t get_sm_version_num();
diff --git a/csrc/cutlass_extensions/cute_utils.cuh b/csrc/cutlass_extensions/cute_utils.cuh
index 1842fab8b2cac..f61fe3ceb978a 100644
--- a/csrc/cutlass_extensions/cute_utils.cuh
+++ b/csrc/cutlass_extensions/cute_utils.cuh
@@ -20,9 +20,9 @@ CUTE_HOST_DEVICE static constexpr auto permute_layout(Layout l) {
// is the layout f(x) = x
template <typename Layout>
CUTE_HOST_DEVICE static constexpr bool is_identity_layout() {
-  if constexpr (std::is_same_v<Layout, void>)
+  if constexpr (std::is_same_v<Layout, void>) {
return true;
- else {
+ } else {
constexpr auto coalesced_layout = coalesce(Layout{});
if constexpr (rank(coalesced_layout) == 1 &&
stride<0>(coalesced_layout) == 1) {
diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp
similarity index 99%
rename from csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
rename to csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp
index d407d66ab2aa6..7aa87feb4cce2 100644
--- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
+++ b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp
@@ -52,6 +52,7 @@
// clang-format off
#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
+#include "cutlass/epilogue/threadblock/fusion/visitors.hpp"
#include "cute/tensor.hpp"
namespace cutlass::epilogue::threadblock {
diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
rename to csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
new file mode 100644
index 0000000000000..c69e87999ae71
--- /dev/null
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
@@ -0,0 +1,317 @@
+#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp"
+
+/*
+ This file defines custom epilogues for fusing channel scales, token scales,
+ bias, and activation zero-points onto a GEMM operation using the
+ CUTLASS 2.x API, for sm80 (Ampere) NVIDIA GPUs.
+
+ Epilogues must contain a public type named EVTCompute of type Sm80EVT,
+ as well as a static prepare_args function that constructs an
+ EVTCompute::Arguments struct.
+*/
+
+namespace vllm::c2x {
+
+using namespace cute;
+
+/*
+ * This class provides the common load descriptors for the
+ * ScaledEpilogue[...] classes
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBase {
+ protected:
+  using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
+
+  template <typename T>
+  using ColOrScalarLoad =
+      cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
+          OutputTileThreadMap, T, Stride<Int<1>, Int<0>, Int<0>>>;
+
+  template <typename T>
+  using RowOrScalarLoad =
+      cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast<
+          OutputTileThreadMap, T, Stride<Int<0>, Int<1>, Int<0>>>;
+
+  template <typename T>
+  using ColLoad = cutlass::epilogue::threadblock::VisitorColBroadcast<
+      OutputTileThreadMap, T, Stride<Int<1>, Int<0>, Int<0>>>;
+
+  template <typename T>
+  using RowLoad = cutlass::epilogue::threadblock::VisitorRowBroadcast<
+      OutputTileThreadMap, T, Stride<Int<0>, Int<1>, Int<0>>>;
+
+  template <typename T>
+  using RowOrZeroLoad =
+      cutlass::epilogue::threadblock::VisitorRowOrZeroBroadcast<
+          OutputTileThreadMap, T, Stride<Int<0>, Int<1>, Int<0>>>;
+
+ // This utility function constructs the arguments for the load descriptors
+ // from a tensor. It can handle both row and column, as well as row/column or
+ // scalar cases.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(torch::Tensor const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = static_cast<T*>(tensor.data_ptr());
+    if constexpr (std::is_same_v<Descriptor, ColOrScalarLoad<T>> ||
+                  std::is_same_v<Descriptor, RowOrScalarLoad<T>>) {
+      return Arguments{data_ptr, tensor.numel() != 1};
+    } else {
+      // it would technically work but no use case as data_ptr is never nullptr
+      static_assert(!std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
+      return Arguments{data_ptr};
+    }
+ }
+
+ // This overload handles the case where there might not be a tensor, in which
+ // case a nullptr is passed and a constant (0) is used.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
+    static_assert(std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
+ return Arguments{data_ptr};
+ }
+};
+
+/*
+ This epilogue function defines a quantized GEMM operation similar to
+ torch._scaled_mm.
+
+ A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
+ per-row. B can be quantized per-tensor or per-column.
+ Any combination of per-tensor and per-row or column is supported.
+ A and B must have symmetric quantization (zero point == 0).
+
+ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
+ scales are applied elementwise with numpy-style broadcasting.
+
+ ScaleA and ScaleB define the epilogue functions that apply the scales for
+ the A and B operands respectively. These scales may be either per-tensor or
+ per row or column.
+*/
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogue
+    : private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+
+ using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTCompute0 =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
+
+ using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+ using EVTCompute =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA, EVTCompute0>;
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+
+ typename EVTCompute0::Arguments evt0_args{b_args};
+ return ArgumentType{a_args, evt0_args};
+ }
+};
+
+/*
+ * This epilogue performs the same operation as ScaledEpilogue, but adds a bias.
+ * This bias can also be used in the per-tensor azp case, where the activation
+ * zero point (azp) is used to compute an azp correction term,
+ * which is folded into the bias.
+ *
+ * The bias tensor must be per-output channel.
+ * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBias
+    : protected ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ protected:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowLoad<ElementD>;
+ using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTCompute0 =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
+
+ using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA,
+                                                             EVTCompute0, Bias>;
+ using ArgumentType = typename EVTCompute::Arguments;
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
+
+ typename EVTCompute0::Arguments evt0_args{b_args};
+ return ArgumentType{a_args, evt0_args, bias_args};
+ }
+};
+
+/*
+ * This epilogue directly supports per-tensor azp in int32 form.
+ * As opposed to the per-token epilogue below, this epilogue only has an azp_adj
+ * term, which should already be multiplied with the scalar azp.
+ * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B.
+ *
+ * This epilogue also supports bias, which remains per-channel.
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBiasAzp
+    : protected ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowOrZeroLoad<ElementD>;
+
+  // This is the full AZP term, azp * J @ B, shape (1,n)
+  using AzpWithAdj = typename SUPER::template RowLoad<int32_t>;
+
+ // Compute float(accum - azp_adj), both operands are int32_t
+ using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::minus, float, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeAzp =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeAzp, Accum, AzpWithAdj>;
+
+ using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeScaleB =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeScaleB, ScaleB,
+                                              EVTComputeAzp>;
+
+ using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+ using EVTCompute =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeScaleBiasA, ScaleA,
+                                              EVTComputeScaleB, Bias>;
+
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+                                   c10::optional<torch::Tensor> const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
+    auto azp_adj_args =
+        SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
+
+ typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
+ return ArgumentType{a_args, evt_scale_b_args, bias_args};
+ }
+};
+
+/*
+ * This epilogue supports per-token azp by computing and applying
+ * the correction term using a rank-1 update. If the term were materialized,
+ * it would require O(m*n) space, and this way it only requires O(m+n) space.
+ * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero
+ * point for each row of A.
+ * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B.
+ *
+ * This epilogue also supports bias, which remains per-channel.
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBiasAzpToken
+    : protected ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowOrZeroLoad<ElementD>;
+
+  // Per-token azp term, shape (m,1)
+  using Azp = typename SUPER::template ColLoad<int32_t>;
+
+  // This is the AZP adjustment term, J @ B, shape (1,n)
+  using AzpAdj = typename SUPER::template RowLoad<int32_t>;
+
+ // Compute azp * azp_adj
+ using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, int32_t, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeAzp =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeAzp, Azp, AzpAdj>;
+
+ // Compute float(accum - azp*azp_adj), all operands are int32_t
+ using ComputeAcc = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::minus, float, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeAcc =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeAcc, Accum, EVTComputeAzp>;
+
+ using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeScaleB =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeScaleB, ScaleB,
+                                              EVTComputeAcc>;
+
+ using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeScaleBiasA, ScaleA, EVTComputeScaleB, Bias>;
+
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+ torch::Tensor const& azp,
+                                   c10::optional<torch::Tensor> const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, float>(bias);
+    auto azp_args = SUPER::template args_from_tensor<Azp, int32_t>(azp);
+    auto azp_adj_args =
+        SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
+
+ typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
+ typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
+ return ArgumentType{a_args, evt_scale_b_args, bias_args};
+ }
+};
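The rank-1 correction described in the comment above can be spelled out as a plain reference loop. The sketch below is illustrative only (not vLLM code); it makes the O(m+n) storage explicit: only the azp and azp_adj vectors are kept, and the (m, n) correction term is formed element by element.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Reference for the per-token azp epilogue; hypothetical helper, row-major
// accum of shape (m, n). Per-tensor scales can be modeled by vectors of size 1.
std::vector<float> azp_token_epilogue_ref(
    const std::vector<int32_t>& accum, const std::vector<int32_t>& azp,
    const std::vector<int32_t>& azp_adj, const std::vector<float>& a_scale,
    const std::vector<float>& b_scale, const std::vector<float>& bias) {
  const size_t m = azp.size(), n = azp_adj.size();
  std::vector<float> d(m * n);
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < n; ++j) {
      // The rank-1 correction azp[i] * azp_adj[j] is computed on the fly,
      // never materialized as an (m, n) matrix.
      const int32_t corrected = accum[i * n + j] - azp[i] * azp_adj[j];
      d[i * n + j] =
          a_scale[i] * (b_scale[j] * static_cast<float>(corrected)) + bias[j];
    }
  }
  return d;
}
```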
+
+}; // namespace vllm::c2x
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
new file mode 100644
index 0000000000000..fcc17c7727f94
--- /dev/null
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
@@ -0,0 +1,315 @@
+#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
+
+/*
+ This file defines custom epilogues for fusing channel scales, token scales,
+ bias, and activation zero-points onto a GEMM operation using the
+ CUTLASS 3.x API, for NVIDIA GPUs with sm90a (Hopper) or later.
+
+ Epilogues must contain a public type named EVTCompute of type Sm90EVT,
+ as well as a static prepare_args function that constructs an
+ EVTCompute::Arguments struct.
+*/
+
+namespace vllm::c3x {
+
+using namespace cute;
+
+/*
+ * This class provides the common load descriptors for the
+ * ScaledEpilogue[...] classes
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBase {
+ protected:
+ using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
+
+  template <typename T>
+  using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
+      Stride<Int<1>, Int<0>, Int<0>>>;
+
+  template <typename T>
+  using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
+      Stride<Int<0>, Int<1>, Int<0>>>;
+
+  // Don't want to support nullptr by default
+  template <typename T, bool EnableNullPtr = false>
+  using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
+      Stride<Int<1>, Int<0>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
+
+  // Don't want to support nullptr by default
+  template <typename T, bool EnableNullPtr = false>
+  using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
+      Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
+
+ // This utility function constructs the arguments for the load descriptors
+ // from a tensor. It can handle both row and column, as well as row/column or
+ // scalar cases.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(torch::Tensor const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = static_cast<T*>(tensor.data_ptr());
+    if constexpr (std::is_same_v<Descriptor, ColOrScalarLoad<T>> ||
+                  std::is_same_v<Descriptor, RowOrScalarLoad<T>>) {
+      return Arguments{data_ptr, tensor.numel() != 1};
+    } else {
+      static_assert(!std::is_same_v<Descriptor, ColLoad<T, true>> &&
+                    !std::is_same_v<Descriptor, RowLoad<T, true>>);
+ return Arguments{data_ptr};
+ }
+ }
+
+ // This overload handles the case where there might not be a tensor, in which
+ // case a nullptr is passed and a constant (0) is used.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
+    static_assert(std::is_same_v<Descriptor, ColLoad<T, true>> ||
+                  std::is_same_v<Descriptor, RowLoad<T, true>>);
+ return Arguments{data_ptr};
+ }
+};
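A note on how callers are expected to use args_from_tensor: the per-tensor versus per-row/column decision is carried entirely by the `tensor.numel() != 1` flag above. The helper below is a hypothetical illustration of that rule, not an API of this file.

```cpp
#include <torch/torch.h>

// Hypothetical helper mirroring the `tensor.numel() != 1` flag that
// args_from_tensor passes into ColOrScalarLoad / RowOrScalarLoad::Arguments.
// A one-element scale tensor is broadcast as a scalar; anything larger is
// loaded as a full per-row or per-column vector.
inline bool scale_is_scalar(torch::Tensor const& scale) {
  return scale.numel() == 1;
}

// Example shapes for an (m, k) x (k, n) GEMM:
//   a_scales: (1, 1) -> scalar broadcast, (m, 1) -> per-token column load
//   b_scales: (1, 1) -> scalar broadcast, (1, n) -> per-channel row load
```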
+
+/*
+   This epilogue function defines a quantized GEMM operation similar to
+   torch._scaled_mm.
+
+   A and B may both be either int8 or fp8_e4m3. A can be
+ quantized per-tensor or per-row. B can be quantized per-tensor or per-column.
+ Any combination of per-tensor and per-row or column is supported.
+ A and B must have symmetric quantization (zero point == 0).
+
+ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
+ scales are applied elementwise with numpy-style broadcasting.
+
+ ScaleA and ScaleB define the epilogue functions that apply the scales for
+ the A and B operands respectively. These scales may be either per-tensor or
+ per row or column.
+*/
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogue
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+
+ using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 =
+      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
+
+ using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0>;
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+
+ typename EVTCompute0::Arguments evt0_args{b_args};
+ return ArgumentType{a_args, evt0_args};
+ }
+};
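Since the comment above requires symmetric quantization (zero point == 0), the scales consumed by this epilogue typically come from a simple absmax quantizer. The sketch below shows one plausible way such a per-row int8 scale could be produced; it is illustrative only and not part of vLLM.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Symmetric per-row int8 quantization: A_row ~= a_scale * A_q_row with a zero
// point of 0, matching the requirement stated for ScaledEpilogue.
inline float quantize_row_int8(const std::vector<float>& row,
                               std::vector<int8_t>& q) {
  float absmax = 0.0f;
  for (float v : row) absmax = std::max(absmax, std::fabs(v));
  const float scale = absmax > 0.0f ? absmax / 127.0f : 1.0f;
  q.resize(row.size());
  for (size_t k = 0; k < row.size(); ++k) {
    q[k] = static_cast<int8_t>(std::lround(row[k] / scale));
  }
  return scale;  // this value is what ends up in a_scales
}
```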
+
+/*
+ * This epilogue performs the same operation as ScaledEpilogue, but adds a bias.
+ * This bias can also be used in the per-tensor azp case, where the activation
+ * zero point (azp) is used to compute an azp correction term,
+ * which is folded into the bias.
+ *
+ * The bias tensor must be per-output channel.
+ * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBias
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowLoad<float>;
+
+ using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 =
+      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
+
+ using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0, Bias>;
+
+ using ArgumentType = typename EVTCompute::Arguments;
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, float>(bias);
+
+ typename EVTCompute0::Arguments evt0_args{b_args};
+ return ArgumentType{a_args, evt0_args, bias_args};
+ }
+};
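The "folded into the bias" remark above can be made concrete: when a_scale and azp are per-tensor scalars, the azp correction a_scale * b_scale[n] * azp * colsum(B_q)[n] depends only on the output channel, so it can be pre-subtracted from the bias once and this plain bias epilogue used unchanged. A hedged host-side sketch with hypothetical names (not vLLM code):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Fold a per-tensor azp correction into a per-channel bias, assuming a_scale
// and azp are scalars; colsum_bq[j] = sum_k B_q[k][j] (i.e. J @ B_q).
std::vector<float> fold_azp_into_bias(const std::vector<float>& bias,
                                      const std::vector<float>& b_scale,
                                      const std::vector<int32_t>& colsum_bq,
                                      float a_scale, int32_t azp) {
  std::vector<float> folded(bias.size());
  for (size_t j = 0; j < bias.size(); ++j) {
    folded[j] = bias[j] - a_scale * b_scale[j] * static_cast<float>(azp) *
                              static_cast<float>(colsum_bq[j]);
  }
  return folded;
}
```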
+
+/*
+ * This epilogue directly supports per-tensor azp in int32 form.
+ * As opposed to the per-token epilogue below, this epilogue only has an azp_adj
+ * term, which should already be multiplied with the scalar azp.
+ * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B.
+ *
+ * This epilogue also supports bias, which remains per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBiasAzp
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowLoad<float, true>;
+
+  // This is the full AZP term, azp * J @ B, shape (1,n)
+  using AzpWithAdj = typename SUPER::template RowLoad<int32_t>;
+
+ // Compute float(accum - azp_adj), both operands are int32_t
+ using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::minus, float, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeAzp =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeAzp, Accum, AzpWithAdj>;
+
+ using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeScaleB =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAzp>;
+
+ using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleBiasA, ScaleA, EVTComputeScaleB, Bias>;
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+                                   c10::optional<torch::Tensor> const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, float>(bias);
+    auto azp_adj_args =
+        SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
+
+ typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
+ return ArgumentType{a_args, evt_scale_b_args, bias_args};
+ }
+};
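As a sanity check on the algebra this epilogue relies on, the identity A_f @ B_f ≈ a_s * b_s * (A_q @ B_q − azp * (J @ B_q)) can be verified on a toy example. The snippet below is purely illustrative:

```cpp
#include <cassert>
#include <cstdint>

// Verifies, for a 1x2 @ 2x1 toy case, that subtracting azp_adj = azp * (J @ B_q)
// from the raw int32 accumulator equals removing the zero point up front.
int main() {
  const int32_t a_q[2] = {7, 2};   // stored (zero-point-offset) activations
  const int32_t azp = 5;           // per-tensor activation zero point
  const int32_t b_q[2] = {3, -2};  // quantized weights (one output column)

  const int32_t accum = a_q[0] * b_q[0] + a_q[1] * b_q[1];
  const int32_t azp_adj = azp * (b_q[0] + b_q[1]);  // azp * (J @ B_q)
  const int32_t direct = (a_q[0] - azp) * b_q[0] + (a_q[1] - azp) * b_q[1];

  assert(accum - azp_adj == direct);
  return 0;
}
```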
+
+/*
+ * This epilogue supports per-token azp by computing and applying
+ * the correction term using a rank-1 update. If the term were materialized,
+ * it would require O(m*n) space, and this way it only requires O(m+n) space.
+ * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero
+ * point for each row of A.
+ * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B.
+ *
+ * This epilogue also supports bias, which remains per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBiasAzpToken
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowLoad<float, true>;
+
+  // Per-token azp term, shape (m,1)
+  using Azp = typename SUPER::template ColLoad<int32_t>;
+
+  // This is the AZP adjustment term, J @ B, shape (1,n)
+  using AzpAdj = typename SUPER::template RowLoad<int32_t>;
+
+ // Compute azp * azp_adj
+ using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, int32_t, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeAzp =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeAzp, Azp, AzpAdj>;
+
+ // Compute float(accum - azp*azp_adj), all operands are int32_t
+ using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::minus, float, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeAcc =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeAcc, Accum, EVTComputeAzp>;
+
+ using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeScaleB =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAcc>;
+
+ using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleBiasA, ScaleA, EVTComputeScaleB, Bias>;
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+ torch::Tensor const& azp,
+                                   c10::optional<torch::Tensor> const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, float>(bias);
+    auto azp_args = SUPER::template args_from_tensor<Azp, int32_t>(azp);
+    auto azp_adj_args =
+        SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
+
+ typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
+ typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
+ return ArgumentType{a_args, evt_scale_b_args, bias_args};
+ }
+};
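The azp_adj input described above (J @ B) is just the per-column sum of the quantized weight matrix and can be computed once when the weights are loaded. A minimal host-side sketch with hypothetical names:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// azp_adj[j] = sum_k B_q[k][j], i.e. the all-ones row J applied to B_q.
// B_q is (k, n), row-major, int8; the result stays in int32 like the GEMM
// accumulator it is subtracted from.
std::vector<int32_t> compute_azp_adj(const std::vector<int8_t>& b_q, size_t k,
                                     size_t n) {
  std::vector<int32_t> azp_adj(n, 0);
  for (size_t kk = 0; kk < k; ++kk) {
    for (size_t j = 0; j < n; ++j) {
      azp_adj[j] += static_cast<int32_t>(b_q[kk * n + j]);
    }
  }
  return azp_adj;
}
```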
+
+}; // namespace vllm::c3x
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index 4fcfcd311aa91..a5beea1a35e49 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -35,6 +35,35 @@ class MixedInputKernelScheduleType(enum.Enum):
}
}
+VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = {
+ **DataTypeSize, # type: ignore
+ **{
+ VLLMDataType.u4b8: 4,
+ VLLMDataType.u8b128: 8,
+ }
+}
+
+VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
+ VLLMDataType.u4b8: "vllm::kU4B8",
+ VLLMDataType.u8b128: "vllm::kU8B128",
+ DataType.u4: "vllm::kU4",
+ DataType.u8: "vllm::kU8",
+ DataType.s4: "vllm::kS4",
+ DataType.s8: "vllm::kS8",
+ DataType.f16: "vllm::kFloat16",
+ DataType.bf16: "vllm::kBfloat16",
+}
+
+VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
+ DataType.u8: "at::ScalarType::Byte",
+ DataType.s8: "at::ScalarType::Char",
+ DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
+ DataType.s32: "at::ScalarType::Int",
+ DataType.f16: "at::ScalarType::Half",
+ DataType.bf16: "at::ScalarType::BFloat16",
+ DataType.f32: "at::ScalarType::Float",
+}
+
VLLMKernelScheduleTag: Dict[Union[
MixedInputKernelScheduleType, KernelScheduleType], str] = {
**KernelScheduleTag, # type: ignore
diff --git a/csrc/cutlass_extensions/vllm_numeric_conversion.cuh b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh
index 2ad914f8e9868..90f226cf64c0a 100644
--- a/csrc/cutlass_extensions/vllm_numeric_conversion.cuh
+++ b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh
@@ -3,6 +3,7 @@
#include "cutlass/numeric_conversion.h"
#include "cutlass_extensions/vllm_custom_types.cuh"
#include "cutlass_extensions/cute_utils.cuh"
+#include "cutlass_extensions/vllm_type_utils.cuh"
// this file extends:
// https://github.com/NVIDIA/cutlass/blob/cutlass-3.5.0/include/cutlass/numeric_conversion.h
@@ -28,8 +29,19 @@ struct InterleavedNumericArrayConverter {
CUTLASS_DEVICE
static result_type convert(source_type const& source) {
- CUTE_INVALID_CONTROL_PATH(
- "InterleavedNumericArrayConverter not implemented\n");
+ if (cute::elect_one_sync()) {
+      if constexpr (std::is_same_v<IlvBlkLayout, void>) {
+ printf(
+ "Convert %s <= %s (N = %d, IlvBlkLayout = void), not implemented\n",
+            nameof_v<T>, nameof_v<S>, N);
+ } else {
+ printf(
+ "Convert %s <= %s (N = %d, size(IlvBlkLayout{}) = %d), not "
+ "implemented\n",
+            nameof_v<T>, nameof_v<S>, N, size(IlvBlkLayout{}));
+ }
+ __brkpt();
+ }
return {};
}
@@ -56,11 +68,6 @@ struct InterleavedNumericArrayConverter<
result_type operator()(source_type const& s) const { return convert(s); }
};
-// TODO (LucasWilkinson): Implement
-// for Array <= Array
-
-// ....
-
 template <typename RegConvert32bit, typename T, typename S, int N>
 struct ArrayConverterPacked32Bit {
   using result_type = Array<T, N>;
@@ -86,14 +93,16 @@ struct ArrayConverterPacked32Bit {
   using ScalarConverter = NumericConverter<T, S, Round>;
   template <typename PackedSrc>
- CUTLASS_DEVICE static uint32_t to_reg(PackedSrc const& source) {
+ CUTLASS_DEVICE static auto to_regs(PackedSrc const& src) {
if constexpr (sizeof(PackedSrc) == 1) {
-      return static_cast<uint32_t>(reinterpret_cast<uint8_t const&>(source));
+      return Array<uint32_t, 1>{reinterpret_cast<uint8_t const&>(src)};
} else if constexpr (sizeof(PackedSrc) == 2) {
-      return static_cast<uint32_t>(reinterpret_cast<uint16_t const&>(source));
+      return Array<uint32_t, 1>{reinterpret_cast<uint16_t const&>(src)};
+ } else if constexpr (sizeof(PackedSrc) == 4) {
+      return Array<uint32_t, 1>{reinterpret_cast<uint32_t const&>(src)};
} else {
- static_assert(sizeof(PackedSrc) == 4);
-      return reinterpret_cast<uint32_t const&>(source);
+ static_assert(sizeof(PackedSrc) == 8);
+      return reinterpret_cast<Array<uint32_t, 2> const&>(src);
}
}
@@ -110,7 +119,7 @@ struct ArrayConverterPacked32Bit {
     static_assert(std::is_same_v<typename PackedSrcType::Element, S>);
     static_assert(std::is_same_v<typename PackedResultType::Element, T>);
-    return RegConvert32bit::template convert<PackedResultType>(to_reg(source));
+    return RegConvert32bit::template convert<PackedResultType>(to_regs(source));
}
friend class detail::VectorizedConverter;
@@ -140,6 +149,131 @@ struct ArrayConverterPacked32Bit {
}
};
+// Convert 8 4-bit values packed into a 32-bit register to 8 8-bit values
+// packed into two 32-bit registers.
+template <uint8_t LUT0, uint8_t LUT1, uint8_t LUT2, uint8_t LUT3,
+          uint8_t LUT4, uint8_t LUT5, uint8_t LUT6, uint8_t LUT7,
+          uint8_t LUT8, uint8_t LUT9, uint8_t LUT10, uint8_t LUT11,
+          uint8_t LUT12, uint8_t LUT13, uint8_t LUT14, uint8_t LUT15>
+CUTLASS_DEVICE cutlass::AlignedArray<uint32_t, 2> lut_4bit_to_8bit_convert(
+ uint32_t src) {
+  cutlass::AlignedArray<uint32_t, 2> r;
+  // Determines whether the value indexes the top half of the LUT
+  // (i.e. LUT[8:15]) if set, or the bottom half (i.e. LUT[0:7]) if not set.
+  // Then move it into bit position 0x4 of each nibble so that, when or'd with
+  // final_prmt_base, it selects the correct candidate. When elements in
+  // final_prmt_base are >= 0x4, the high candidate is selected
+  // (i.e. LUT[8:15]); when elements are < 0x4, the low candidate is selected
+  // (i.e. LUT[0:7]).
+ uint32_t high_bit = (src & 0x88888888) >> 1;
+
+  // `high_bit` is OR'd with final_prmt_base (0x32103210) to find the correct
+  // value in the LUT (selects the correct high or low candidate)
+ const uint32_t final_prmt_base = 0x32103210;
+
+  // Ignore the high bit when indexing into the LUT: for each 4-bit value we
+  // index into both the high and low candidates, then use
+  // high_bit | final_prmt_base to select the correct candidate.
+ uint32_t lut_idx = (src & 0x77777777);
+
+ auto pack = [](uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
+ return uint32_t(a) | (uint32_t(b) << 8) | (uint32_t(c) << 16) |
+ (uint32_t(d) << 24);
+ };
+
+ static constexpr uint32_t LOW_0 = pack(LUT0, LUT1, LUT2, LUT3);
+ static constexpr uint32_t LOW_1 = pack(LUT4, LUT5, LUT6, LUT7);
+ static constexpr uint32_t HIGH_0 = pack(LUT8, LUT9, LUT10, LUT11);
+ static constexpr uint32_t HIGH_1 = pack(LUT12, LUT13, LUT14, LUT15);
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int ii = 0; ii < 2; ++ii, lut_idx >>= 16, high_bit >>= 16) {
+ uint32_t final_prmt_idx = final_prmt_base | high_bit;
+
+    // This uses a lookup table to convert packed int4s to packed int8s,
+    // using the int4 value as the index into prmt. It first selects both the
+    // high and low candidates, then uses the high bit (i.e. `high_bit`) to
+    // select the correct candidate.
+ asm volatile(
+ "{\n"
+ " .reg .b32 low, high;\n"
+ " prmt.b32 low, %1, %2, %5;\n"
+ " prmt.b32 high, %3, %4, %5;\n"
+ " prmt.b32 %0, low, high, %6;\n"
+ "}\n"
+ : "=r"(r[ii])
+ : "n"(LOW_0), "n"(LOW_1), "n"(HIGH_0), "n"(HIGH_1), "r"(lut_idx),
+ "r"(final_prmt_idx));
+ }
+
+ return r;
+};
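For readers who want to check the prmt-based path above, here is a host-side reference of the same LUT conversion (illustrative only, not vLLM code): each of the 8 nibbles of src indexes a 16-entry table, and the 8 resulting bytes are packed little-nibble-first into two 32-bit words, matching r[0] and r[1].

```cpp
#include <array>
#include <cstdint>

// Host reference for lut_4bit_to_8bit_convert: nibble i of `src` selects
// lut[nibble], and the byte lands in word i / 4 at byte position i % 4.
std::array<uint32_t, 2> lut_4bit_to_8bit_ref(
    uint32_t src, const std::array<uint8_t, 16>& lut) {
  std::array<uint32_t, 2> out{0u, 0u};
  for (int i = 0; i < 8; ++i) {
    const uint8_t nibble = (src >> (4 * i)) & 0xF;
    out[i / 4] |= static_cast<uint32_t>(lut[nibble]) << (8 * (i % 4));
  }
  return out;
}
```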
+
+// for Array<int8_t, N> <= Array<vllm_uint4b8_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<int8_t, vllm_uint4b8_t, N, Round> {
+  using result_type = Array<int8_t, N>;
+  using source_type = Array<vllm_uint4b8_t, N>;
+
+ static FloatRoundStyle const round_style = Round;
+
+ private:
+ struct RegConvert {
+    template <typename PackedResultType>
+ CUTLASS_DEVICE static PackedResultType convert(Array