diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index eec2a51e2f8fd..64ba1b32fb074 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -9,8 +9,11 @@ steps:
- image: badouralix/curl-jq
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+
- wait
+
- label: "A100"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
plugins:
@@ -18,7 +21,7 @@ steps:
podSpec:
priorityClassName: perf-benchmark
containers:
- - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources:
@@ -41,20 +44,48 @@ steps:
- name: devshm
emptyDir:
medium: Memory
- # - label: "H100"
- # agents:
- # queue: H100
- # plugins:
- # - docker#v5.11.0:
- # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- # command:
- # - bash
- # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
- # mount-buildkite-agent: true
- # propagate-environment: true
- # ipc: host
- # gpus: all
- # environment:
- # - VLLM_USAGE_SOURCE
- # - HF_TOKEN
+ - label: "H200"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+ agents:
+ queue: H200
+ plugins:
+ - docker#v5.12.0:
+ image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+ command:
+ - bash
+ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+ mount-buildkite-agent: true
+ propagate-environment: true
+ ipc: host
+ gpus: 4,5,6,7
+ volumes:
+ - /data/benchmark-hf-cache:/root/.cache/huggingface
+ environment:
+ - VLLM_USAGE_SOURCE
+ - HF_TOKEN
+
+ - block: "Run H100 Benchmark"
+ key: block-h100
+ depends_on: ~
+
+ - label: "H100"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+ agents:
+ queue: H100
+ depends_on: block-h100
+ plugins:
+ - docker#v5.12.0:
+ image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+ command:
+ - bash
+ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+ mount-buildkite-agent: true
+ propagate-environment: true
+ ipc: host
+ gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+ volumes:
+ - /data/benchmark-hf-cache:/root/.cache/huggingface
+ environment:
+ - VLLM_USAGE_SOURCE
+ - HF_TOKEN
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 7cf05610b9953..9d3646e2f6a15 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -157,6 +157,18 @@ def results_to_json(latency, throughput, serving):
throughput_results,
serving_results)
+    for df in [latency_results, serving_results, throughput_results]:
+        if df.empty:
+            continue
+
+        # Sort all dataframes by their respective "Test name" columns
+        df.sort_values(by="Test name", inplace=True)
+
+        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
+        # we want to turn it into "8xGPUTYPE"
+        df["GPU"] = df["GPU"].apply(
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+
# get markdown tables
latency_md_table = tabulate(latency_results,
headers='keys',
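
For reference, a minimal sketch of what the added post-processing does to each results table, using hypothetical sample data (pandas is already a dependency of the script above): rows are sorted by "Test name" and a newline-joined GPU column such as "H100\nH100\n..." is collapsed into "<count>xH100".

```python
# Sketch of the dataframe post-processing added above (hypothetical sample data).
import pandas as pd

df = pd.DataFrame({
    "Test name": ["serving_llama8B_tp1_qps_4", "latency_llama8B_tp1"],
    "GPU": ["H100\nH100\nH100\nH100", "H100"],
})

# Sort rows by test name, as the script now does for every results table.
df.sort_values(by="Test name", inplace=True)

# Collapse "GPUTYPE\nGPUTYPE\n..." into "<count>xGPUTYPE".
df["GPU"] = df["GPU"].apply(lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}")

print(df.to_string(index=False))
# latency_llama8B_tp1       -> GPU column becomes "1xH100"
# serving_llama8B_tp1_qps_4 -> GPU column becomes "4xH100"
```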
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index d397b05cdff23..0d16a83781ab2 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -6,6 +6,7 @@
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
+set -x
set -o pipefail
check_gpus() {
@@ -85,11 +86,7 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
- pkill -f pt_main_thread
- # this line doesn't work now
- # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
- pkill -f python3
- pkill -f /usr/bin/python3
+ pgrep python3 | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
@@ -289,7 +286,7 @@ run_serving_tests() {
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
- eval "$server_command" &
+ bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -322,7 +319,7 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
- eval "$client_command"
+ bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
index 19f7160e68a4d..aa0f7ade808e0 100644
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -1,6 +1,6 @@
#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10
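
For reference, a rough Python equivalent of the manifest check this helper performs against the renamed repository. The token and manifest endpoints come from the lines above; the polling loop, sleep interval, and commit value are assumptions, since the rest of the script is outside this hunk.

```python
# Rough sketch of the image-availability check (endpoints taken from the script above;
# the retry loop and the commit string are assumptions for illustration only).
import json
import time
import urllib.error
import urllib.request

REPO = "q9t5s3a7/vllm-ci-postmerge-repo"
commit = "abc123"  # stand-in for $BUILDKITE_COMMIT

token_url = ("https://public.ecr.aws/token?service=public.ecr.aws"
             f"&scope=repository:{REPO}:pull")
with urllib.request.urlopen(token_url) as resp:
    token = json.load(resp)["token"]

manifest_url = f"https://public.ecr.aws/v2/{REPO}/manifests/{commit}"
request = urllib.request.Request(
    manifest_url, headers={"Authorization": f"Bearer {token}"})

deadline = time.time() + 10  # mirrors TIMEOUT_SECONDS above
while time.time() < deadline:
    try:
        with urllib.request.urlopen(request):
            print("image manifest found, proceeding")
            break
    except urllib.error.HTTPError:
        time.sleep(2)  # image not pushed yet; poll again
else:
    raise SystemExit("timed out waiting for the CI image")
```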
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index f78e360b7afd3..2de6fceb0c3fe 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,7 +1,7 @@
steps:
- label: "Build wheel - CUDA 12.1"
agents:
- queue: cpu_queue
+ queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
@@ -18,7 +18,7 @@ steps:
- label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel
agents:
- queue: cpu_queue
+ queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
@@ -26,3 +26,32 @@ steps:
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
+
+ - block: "Build release image"
+ depends_on: ~
+ key: block-release-image-build
+
+ - label: "Build release image"
+ depends_on: block-release-image-build
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+ - label: "Build and publish TPU release image"
+ depends_on: ~
+ if: build.env("NIGHTLY") == "1"
+ agents:
+ queue: tpu_queue_postmerge
+ commands:
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+ - "docker push vllm/vllm-tpu:nightly"
+ - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllm
+ password-env: DOCKERHUB_TOKEN
+ env:
+ DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 902e162720b89..3515ccd65667e 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -85,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
- --ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 5d7a0bff90963..bc06838d804ff 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -4,49 +4,11 @@
# It serves a sanity check for compilation and basic model usage.
set -ex
-# Try building the docker image
-docker build -t cpu-test -f Dockerfile.ppc64le .
-
# Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; }
+remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
trap remove_docker_container EXIT
remove_docker_container
-# Run the image, setting --shm-size=4g for tensor parallel.
-source /etc/environment
-#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
-
-function cpu_tests() {
- set -e
-
- # Run basic model test
- docker exec cpu-test bash -c "
- set -e
- pip install pytest pytest-asyncio \
- decord einops librosa peft Pillow sentence-transformers soundfile \
- transformers_stream_generator matplotlib datamodel_code_generator
- pip install torchvision --index-url https://download.pytorch.org/whl/cpu
- pytest -v -s tests/models/decoder_only/language -m cpu_model
- pytest -v -s tests/models/embedding/language -m cpu_model
- pytest -v -s tests/models/encoder_decoder/language -m cpu_model
- pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
- pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
-
- # online inference
- docker exec cpu-test bash -c "
- set -e
- python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
- timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
- python3 benchmarks/benchmark_serving.py \
- --backend vllm \
- --dataset-name random \
- --model facebook/opt-125m \
- --num-prompts 20 \
- --endpoint /v1/completions \
- --tokenizer facebook/opt-125m"
-}
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .
-# All of CPU tests are expected to be finished less than 25 mins.
-export -f cpu_tests
-timeout 25m bash -c "cpu_tests"
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 14756b5964aaf..4f1729d46dae2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -13,26 +13,27 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup
-remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
+remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
function cpu_tests() {
set -e
+ export NUMA_NODE=$2
# offline inference
- docker exec cpu-test-avx2 bash -c "
+ docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference.py"
# Run basic model test
- docker exec cpu-test bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -45,20 +46,26 @@ function cpu_tests() {
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# Run compressed-tensor test
- docker exec cpu-test bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
- docker exec cpu-test bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"
+ # Run chunked-prefill and prefix-cache test
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -s -v -k cpu_model \
+ tests/basic_correctness/test_chunked_prefill.py"
+
# online inference
- docker exec cpu-test bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -75,4 +82,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 25 mins.
export -f cpu_tests
-timeout 25m bash -c "cpu_tests $CORE_RANGE"
+timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
new file mode 100644
index 0000000000000..d06604f96f2b8
--- /dev/null
+++ b/.buildkite/run-gh200-test.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# This script builds the GH200 docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+DOCKER_BUILDKIT=1 docker build . \
+ --target vllm-openai \
+ --platform "linux/arm64" \
+ -t gh200-test \
+ --build-arg max_jobs=66 \
+ --build-arg nvcc_threads=2 \
+ --build-arg torch_cuda_arch_list="9.0+PTX" \
+ --build-arg vllm_fa_cmake_gpu_arches="90-real"
+
+# Setup cleanup
+remove_docker_container() { docker rm -f gh200-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and test offline inference
+docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+ python3 examples/offline_inference.py
+'
diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
index 4505dc7a9373c..fa4f74fca7a11 100644
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
\ No newline at end of file
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
\ No newline at end of file
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index faeac8e2ded36..e0a12afbe7320 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -12,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
-# Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
+# Run the image and test offline inference/tensor parallel
+docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+ python3 examples/offline_inference.py
+ python3 examples/offline_inference_cli.py -tp 2
+'
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 24bf223fb12c0..b563c96343f92 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -9,8 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
-# nightly(bool): run this test in nightly pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually)
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -51,7 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
+ - tests/standalone_tests/lazy_torch_compile.py
commands:
+ - python3 standalone_tests/lazy_torch_compile.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@@ -60,6 +61,13 @@ steps:
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
+- label: Python-only Installation Test
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
- label: Basic Correctness Test # 30min
#mirror_hardwares: [amd]
fast_check: true
@@ -171,16 +179,16 @@ steps:
- vllm/
- tests/v1
commands:
- - pytest -v -s v1
+ - VLLM_USE_V1=1 pytest -v -s v1
-- label: Examples Test # 15min
+- label: Examples Test # 25min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
commands:
- - pip install awscli tensorizer # for llava example and tensorizer test
+ - pip install tensorizer # for tensorizer test
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_chat.py
@@ -190,10 +198,13 @@ steps:
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- - python3 offline_profile.py --model facebook/opt-125m
+ - python3 offline_inference_classification.py
+ - python3 offline_inference_embedding.py
+ - python3 offline_inference_scoring.py
+ - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
- #mirror_hardwares: [amd]
+ mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
@@ -213,8 +224,12 @@ steps:
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
+ - vllm/model_executor/guided_decoding
- tests/test_logits_processor
- command: pytest -v -s test_logits_processor.py
+ - tests/model_executor/test_guided_processors
+ commands:
+ - pytest -v -s test_logits_processor.py
+ - pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 30min
source_file_dependencies:
@@ -229,7 +244,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -313,7 +328,7 @@ steps:
##### models test #####
-- label: Basic Models Test # 30min
+- label: Basic Models Test # 24min
source_file_dependencies:
- vllm/
- tests/models
@@ -323,7 +338,7 @@ steps:
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py
-- label: Language Models Test (Standard) # 42min
+- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -333,10 +348,9 @@ steps:
commands:
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- - pytest -v -s models/embedding/vision_language -m core_model
-- label: Language Models Test (Extended) # 50min
- nightly: true
+- label: Language Models Test (Extended) # 1h10min
+ optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
@@ -345,9 +359,8 @@ steps:
commands:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- - pytest -v -s models/embedding/vision_language -m 'not core_model'
-- label: Multi-Modal Models Test (Standard) # 26min
+- label: Multi-Modal Models Test (Standard) # 28min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -356,13 +369,15 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+ - pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
-- label: Multi-Modal Models Test (Extended) # 1h15m
- nightly: true
+- label: Multi-Modal Models Test (Extended) 1 # 1h16m
+ optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
@@ -370,14 +385,26 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+ - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+ - pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
+- label: Multi-Modal Models Test (Extended) 2 # 38m
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/decoder_only/vision_language
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true
@@ -412,11 +439,11 @@ steps:
- tests/distributed/
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd]
@@ -429,19 +456,23 @@ steps:
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
+ - vllm/worker/worker_base.py
+ - vllm/worker/worker.py
+ - vllm/worker/model_runner.py
commands:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
+ - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
@@ -474,18 +505,22 @@ steps:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
-- label: LoRA Long Context (Distributed) # 11min
- # This test runs llama 13B, so it is required to run on 4 GPUs.
+- label: LoRA TP Test (Distributed)
num_gpus: 4
- soft_fail: true
source_file_dependencies:
- vllm/lora
- - tests/lora/test_long_context
+ - tests/lora
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+
- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests"
@@ -513,6 +548,7 @@ steps:
- label: Distributed Tests (A100) # optional
gpu: a100
+ optional: true
num_gpus: 4
source_file_dependencies:
- vllm/
@@ -521,11 +557,12 @@ steps:
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
+ optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh
index 541b395eddbe7..7345dd4e66b29 100644
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@@ -25,7 +25,12 @@ echo "Version: $version"
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
- new_version="1.0.0.dev"
+ suffix="${version##*.}"
+ if [[ $suffix == cu* ]]; then
+ new_version="1.0.0.dev+${suffix}"
+ else
+ new_version="1.0.0.dev"
+ fi
new_wheel="${wheel/$version/$new_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
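
For reference, the renaming rule added above expressed as a small Python sketch. The version strings below are hypothetical; the real script then applies the substitution to the wheel filename via `${wheel/$version/$new_version}`.

```python
# Sketch of the dev-wheel renaming rule from the hunk above (hypothetical versions).
def rename_dev_version(version: str) -> str:
    if "dev" not in version:
        return version
    suffix = version.rsplit(".", 1)[-1]   # mirrors ${version##*.}
    if suffix.startswith("cu"):
        return f"1.0.0.dev+{suffix}"      # keep the CUDA-variant suffix
    return "1.0.0.dev"

print(rename_dev_version("0.6.4.dev123.cu118"))  # -> 1.0.0.dev+cu118
print(rename_dev_version("0.6.4.dev123"))        # -> 1.0.0.dev
```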
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index cd721971d01d6..3cb91fc0f8232 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,13 +3,16 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-CMakeLists.txt @tlrmchlsmth @WoosukKwon
+/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+CMakeLists.txt @tlrmchlsmth
+
+# vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 71f4e520135d4..d1f6105a47166 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1,2 +1,2 @@
github: [vllm-project]
-open_collective: [vllm]
+open_collective: vllm
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 4f54eea564ecb..683b70cd89989 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -15,6 +15,8 @@ updates:
allow:
- dependency-type: "all"
ignore:
+ - dependency-name: "*"
+ update-types: ["version-update:semver-patch"]
- dependency-name: "torch"
- dependency-name: "torchvision"
- dependency-name: "xformers"
@@ -24,9 +26,6 @@ updates:
- dependency-name: "ray[adag]"
- dependency-name: "lm-eval"
groups:
- patch-update:
- applies-to: version-updates
- update-types: ["patch"]
minor-update:
applies-to: version-updates
update-types: ["minor"]
diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh
index 3b2da7b9f8966..3246c6f9bc4b7 100755
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -15,19 +15,36 @@ NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
-# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
-sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"
-
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
sed -i '/FIX #xxxx.*$/d' "${NEW}"
# Remove "FILL IN THE PR DESCRIPTION HERE"
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
+sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
+
+# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
+python3 - <<EOF
+import re
+
+with open("${NEW}", "r") as file:
+    content = file.read()
+
+pattern = re.compile(r'<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
+content = re.sub(pattern, '', content)
+
+with open("${NEW}", "w") as file:
+    file.write(content)
+EOF
+
# Run this only if ${NEW} is different than ${OLD}
if ! cmp -s "${OLD}" "${NEW}"; then
- echo "Updating PR body"
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
+ echo
+ echo "Updated PR body:"
+ echo
+ cat "${NEW}"
else
echo "No changes needed"
fi
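
For reference, a quick illustration of the <details>-stripping pattern used above, applied to a hypothetical PR body:

```python
# Demo of the <details>-section removal on a hypothetical PR body.
import re

body = """Fixes a tokenizer bug.

<details>
<summary> PR Checklist (Click to Expand) </summary>

- [ ] item one
- [ ] item two
</details>
"""

pattern = re.compile(
    r'<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>',
    re.DOTALL)

print(re.sub(pattern, '', body))
# -> "Fixes a tokenizer bug.\n\n\n" (the checklist block is removed)
```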
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
new file mode 100644
index 0000000000000..ab6f6e5d2060d
--- /dev/null
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -0,0 +1,81 @@
+name: Lint and Deploy Charts
+
+on: pull_request
+
+jobs:
+ lint-and-deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ fetch-depth: 0
+
+ - name: Set up Helm
+ uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
+ with:
+ version: v3.14.4
+
+ #Python is required because ct lint runs Yamale and yamllint which require Python.
+ - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: '3.13'
+
+ - name: Set up chart-testing
+ uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
+ with:
+ version: v3.10.1
+
+ - name: Run chart-testing (lint)
+ run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+
+ - name: Setup minio
+ run: |
+ docker network create vllm-net
+ docker run -d -p 9000:9000 --name minio --net vllm-net \
+ -e "MINIO_ACCESS_KEY=minioadmin" \
+ -e "MINIO_SECRET_KEY=minioadmin" \
+ -v /tmp/data:/data \
+ -v /tmp/config:/root/.minio \
+ minio/minio server /data
+ export AWS_ACCESS_KEY_ID=minioadmin
+ export AWS_SECRET_ACCESS_KEY=minioadmin
+ export AWS_EC2_METADATA_DISABLED=true
+ mkdir opt-125m
+ cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
+ aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
+ aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
+
+ - name: Create kind cluster
+ uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
+
+ - name: Build the Docker image vllm cpu
+ run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+
+ - name: Configuration of docker images, network and namespace for the kind cluster
+ run: |
+ docker pull amazon/aws-cli:2.6.4
+ kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
+ kind load docker-image vllm-cpu-env:latest --name chart-testing
+ docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
+ kubectl create ns ns-vllm
+
+ - name: Run chart-testing (install)
+ run: |
+ export AWS_ACCESS_KEY_ID=minioadmin
+ export AWS_SECRET_ACCESS_KEY=minioadmin
+ helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+
+ - name: curl test
+ run: |
+ kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
+ sleep 10
+ CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
+ --header "Content-Type: application/json" \
+ --data '{
+ "model": "opt-125m",
+ "prompt": "San Francisco is a",
+ "max_tokens": 7,
+ "temperature": 0
+ }'):$CODE"
+ echo "$CODE"
\ No newline at end of file
diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml
new file mode 100644
index 0000000000000..4932af943a07b
--- /dev/null
+++ b/.github/workflows/png-lint.yml
@@ -0,0 +1,37 @@
+name: Lint PNG exports from excalidraw
+on:
+ push:
+ branches:
+ - "main"
+ paths:
+ - '*.excalidraw.png'
+ - '.github/workflows/png-lint.yml'
+ pull_request:
+ branches:
+ - "main"
+ paths:
+ - '*.excalidraw.png'
+ - '.github/workflows/png-lint.yml'
+
+env:
+ LC_ALL: en_US.UTF-8
+
+defaults:
+ run:
+ shell: bash
+
+permissions:
+ contents: read
+
+jobs:
+ actionlint:
+ runs-on: ubuntu-latest
+ steps:
+ - name: "Checkout"
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ fetch-depth: 0
+
+ - name: "Run png-lint.sh to check excalidraw exported images"
+ run: |
+ tools/png-lint.sh
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c1051d10a4860..e40ceaaa8b037 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -39,67 +39,68 @@ jobs:
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
- wheel:
- name: Build Wheel
- runs-on: ${{ matrix.os }}
- needs: release
-
- strategy:
- fail-fast: false
- matrix:
- os: ['ubuntu-20.04']
- python-version: ['3.9', '3.10', '3.11', '3.12']
- pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
- cuda-version: ['11.8', '12.1']
-
- steps:
- - name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
- - name: Setup ccache
- uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
- with:
- create-symlink: true
- key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
- - name: Set up Linux Env
- if: ${{ runner.os == 'Linux' }}
- run: |
- bash -x .github/workflows/scripts/env.sh
-
- - name: Set up Python
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- with:
- python-version: ${{ matrix.python-version }}
-
- - name: Install CUDA ${{ matrix.cuda-version }}
- run: |
- bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-
- - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
- run: |
- bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
-
- - name: Build wheel
- shell: bash
- env:
- CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
- run: |
- bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
- wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
- asset_name=${wheel_name//"linux"/"manylinux1"}
- echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
- echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
- - name: Upload Release Asset
- uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- with:
- upload_url: ${{ needs.release.outputs.upload_url }}
- asset_path: ./dist/${{ env.wheel_name }}
- asset_name: ${{ env.asset_name }}
- asset_content_type: application/*
+ # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
+ # wheel:
+ # name: Build Wheel
+ # runs-on: ${{ matrix.os }}
+ # needs: release
+
+ # strategy:
+ # fail-fast: false
+ # matrix:
+ # os: ['ubuntu-20.04']
+ # python-version: ['3.9', '3.10', '3.11', '3.12']
+ # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
+ # cuda-version: ['11.8', '12.1']
+
+ # steps:
+ # - name: Checkout
+ # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+ # - name: Setup ccache
+ # uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
+ # with:
+ # create-symlink: true
+ # key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
+ # - name: Set up Linux Env
+ # if: ${{ runner.os == 'Linux' }}
+ # run: |
+ # bash -x .github/workflows/scripts/env.sh
+
+ # - name: Set up Python
+ # uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ # with:
+ # python-version: ${{ matrix.python-version }}
+
+ # - name: Install CUDA ${{ matrix.cuda-version }}
+ # run: |
+ # bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+ # - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+ # run: |
+ # bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+
+ # - name: Build wheel
+ # shell: bash
+ # env:
+ # CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+ # run: |
+ # bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+ # wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
+ # asset_name=${wheel_name//"linux"/"manylinux1"}
+ # echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+ # echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
+
+ # - name: Upload Release Asset
+ # uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
+ # env:
+ # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ # with:
+ # upload_url: ${{ needs.release.outputs.upload_url }}
+ # asset_path: ./dist/${{ env.wheel_name }}
+ # asset_name: ${{ env.asset_name }}
+ # asset_content_type: application/*
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package
diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml
new file mode 100644
index 0000000000000..e0bb24276a653
--- /dev/null
+++ b/.github/workflows/sphinx-lint.yml
@@ -0,0 +1,32 @@
+name: Lint documentation
+
+on:
+ push:
+ branches:
+ - main
+ paths:
+ - "docs/**"
+ pull_request:
+ branches:
+ - main
+ paths:
+ - "docs/**"
+
+jobs:
+ sphinx-lint:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.12"]
+ steps:
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-lint.txt
+ - name: Linting docs
+ run: tools/sphinx-lint.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5acbd762ee957..51b49a18dddf2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
@@ -196,6 +196,8 @@ set(VLLM_EXT_SRC
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
+ "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
+ "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
@@ -204,19 +206,32 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
- set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+ set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
- FetchContent_Declare(
+ # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
+ if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
+ set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
+ endif()
+
+ if(VLLM_CUTLASS_SRC_DIR)
+ if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
+ get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
+ endif()
+ message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
+ FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
+ else()
+ FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
- GIT_TAG v3.5.1
+ GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
- GIT_SHALLOW TRUE
- )
+ GIT_SHALLOW FALSE
+ )
+ endif()
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
@@ -224,10 +239,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
- "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+ "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+ "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
+ "csrc/sparse/cutlass/sparse_compressor_entry.cu"
+ "csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -236,7 +253,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
@@ -257,11 +274,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
#
- # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+ # The cutlass_scaled_mm, cutlass_scaled_sparse_mm, and cutlass_compressor kernels
+ # for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+ set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+ "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
+ "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -270,12 +290,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
- message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+ message(STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
- "later if you intend on running FP8 quantized models on "
+ "later if you intend on running FP8 sparse or quantized models on "
"Hopper.")
else()
- message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+ message(STATUS "Not building cutlass_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
@@ -288,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
- "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+ "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@@ -390,7 +410,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
- INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+ INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
@@ -414,7 +434,7 @@ set_gencode_flags_for_srcs(
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA")
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@@ -509,7 +529,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
+ GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/Dockerfile b/Dockerfile
index 220dbe26712ec..123703848749c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
+ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
@@ -46,9 +47,14 @@ WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ python3 -m pip install -r requirements-cuda-arm64.txt; \
+ fi
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
+ARG TARGETPLATFORM
# install build dependencies
COPY requirements-build.txt requirements-build.txt
@@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ python3 -m pip install -r requirements-cuda-arm64.txt; \
+ fi
+
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
@@ -134,8 +146,8 @@ COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
-
#################### DEV IMAGE ####################
+
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
@@ -143,6 +155,9 @@ ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETPLATFORM
+
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@@ -168,18 +183,25 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
-# install vllm wheel first, so that torch etc will be installed
+# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=cache,target=/root/.cache/pip \
- . /etc/environment && \
- python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ pip uninstall -y torch && \
+ python3 -m pip install -r requirements-cuda-arm64.txt; \
+ fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+. /etc/environment && \
+if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
+ python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+fi
COPY examples examples
#################### vLLM installation IMAGE ####################
-
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
@@ -191,6 +213,10 @@ ADD . /vllm-workspace/
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+ python3 -m pip install -e tests/vllm_test_utils
+
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install hf_transfer
@@ -205,7 +231,6 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
-
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
@@ -214,8 +239,11 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
-
+ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
+ else \
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
+ fi
ENV VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.arm b/Dockerfile.arm
new file mode 100644
index 0000000000000..093ee2209222f
--- /dev/null
+++ b/Dockerfile.arm
@@ -0,0 +1,62 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-arm
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/apt \
+ apt-get update -y \
+ && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
+
+# Set LD_PRELOAD for tcmalloc on ARM
+ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
+ pip install --upgrade pip && \
+ pip install -r requirements-build.txt
+
+FROM cpu-test-arm AS build
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+ --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
+ pip install -v -r requirements-cpu.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Disabling AVX512 specific optimizations for ARM
+ARG VLLM_CPU_DISABLE_AVX512="true"
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=cache,target=/root/.cache/ccache \
+ --mount=type=bind,source=.git,target=.git \
+ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+ pip install dist/*.whl && \
+ rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 287b4958da4e5..ebe226cf6d148 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -16,7 +16,7 @@ RUN --mount=type=cache,target=/var/cache/apt \
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install intel-openmp
+ pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
@@ -62,4 +62,8 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -e tests/vllm_test_utils
+
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.hpu b/Dockerfile.hpu
index d18fc016387bf..87e0c1a6a934e 100644
--- a/Dockerfile.hpu
+++ b/Dockerfile.hpu
@@ -11,6 +11,9 @@ ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 2143315d2a078..77162bc82de62 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -1,5 +1,6 @@
# default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
+# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
FROM $BASE_IMAGE
@@ -38,4 +39,7 @@ ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
pip install --no-build-isolation -v -e .
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.openvino b/Dockerfile.openvino
index a05ff452cd36e..8bd188ffde408 100644
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -22,4 +22,7 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVIC
COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/benchmarks
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index b19c6ddec7948..971248577983f 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -29,6 +29,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 8fb79afaebe97..e733994f8c33e 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -51,9 +51,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
*"rocm-6.2"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --pre \
- torch==2.6.0.dev20240918 \
+ torch==2.6.0.dev20241113+rocm6.2 \
'setuptools-scm>=8' \
- torchvision==0.20.0.dev20240918 \
+ torchvision==0.20.0.dev20241113+rocm6.2 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
*) ;; esac
@@ -168,4 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
if ls libs/*.whl; then \
python3 -m pip install libs/*.whl; fi
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index 0a507b6ecdf60..b617932a85b47 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -22,4 +22,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
-r requirements-tpu.txt
RUN python3 setup.py develop
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 8bb80a299ccc4..827792c8c7bd8 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -50,9 +50,25 @@ ENV CUDA_HOME="/usr/local/cuda" \
PATH="${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
+# Install necessary packages to build numactl from source
+RUN microdnf install -y autoconf automake libtool make rpm-build
+
+# Download the numactl source RPM to /root directory
+WORKDIR /root
+RUN microdnf download --enablerepo=ubi-9-baseos-source --source numactl.src
+
+# Get NUMACTL_V (just the version part of the RPM file)
+RUN NUMACTL_V=$(ls /root/numactl-* | sed -r 's|/root/numactl-(.+)\.el9\.src\.rpm|\1|') && \
+ echo "########################################### NUMACTL_V is set to $NUMACTL_V" && \
+ rpm -i /root/numactl-${NUMACTL_V}.el9.src.rpm && \
+ rpmbuild -ba /root/rpmbuild/SPECS/numactl.spec && \
+ rpm -i /root/rpmbuild/RPMS/x86_64/{numactl-libs-${NUMACTL_V}.el9.x86_64.rpm,numactl-${NUMACTL_V}.el9.x86_64.rpm,numactl-devel-${NUMACTL_V}.el9.x86_64.rpm}
+
## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base
+WORKDIR /workspace
+
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
@@ -206,10 +222,15 @@ FROM vllm-openai as vllm-grpc-adapter
USER root
+RUN --mount=type=bind,from=cuda-base,source=/root/rpmbuild/RPMS/x86_64,target=/tmp/rpm/ \
+ rpm -i --nodeps /tmp/rpm/numactl-libs-*.el9.x86_64.rpm
+
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
- HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.5.3
+ HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]"
+
+RUN pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter.git@vllm0p6p5
ENV GRPC_PORT=8033 \
PORT=8000 \
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 63bc682770422..a374f20d7d949 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -64,5 +64,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1
-
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/README.md b/README.md
index 6530886ed7de2..93b71ddaccc61 100644
--- a/README.md
+++ b/README.md
@@ -16,9 +16,10 @@ Easy, fast, and cheap LLM serving for everyone
---
*Latest News* 🔥
-- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing).
+- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
+- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
-- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
+- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
@@ -100,6 +101,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Dropbox
- Google Cloud
- Lambda Lab
+- Nebius
- NVIDIA
- Replicate
- Roblox
@@ -132,3 +134,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
* For coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
+
+## Media Kit
+
+* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 25c8b1bbf3e22..b67849038cf0d 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -24,6 +24,7 @@ class RequestFuncInput:
model: str
best_of: int = 1
logprobs: Optional[int] = None
+ extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
@@ -36,6 +37,7 @@ class RequestFuncOutput:
ttft: float = 0.0 # Time to first token
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
+ tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
@@ -54,6 +56,7 @@ async def async_request_tgi(
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
+ "truncate": request_func_input.prompt_len,
# TGI does not accept ignore_eos flag.
}
payload = {
@@ -241,6 +244,8 @@ async def async_request_openai_completions(
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
+ if request_func_input.extra_body:
+ payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
@@ -335,6 +340,8 @@ async def async_request_openai_chat_completions(
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
+ if request_func_input.extra_body:
+ payload.update(request_func_input.extra_body)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py
new file mode 100644
index 0000000000000..1a0e62598bfcb
--- /dev/null
+++ b/benchmarks/benchmark_guided.py
@@ -0,0 +1,494 @@
+"""Benchmark guided decoding throughput."""
+import argparse
+import dataclasses
+import json
+import os
+import random
+import time
+from typing import List
+
+import datasets
+import pandas as pd
+import uvloop
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+ build_async_engine_client_from_engine_args)
+from vllm.sampling_params import GuidedDecodingParams
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+@dataclasses.dataclass
+class SampleRequest:
+ """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+        schema: The guided-decoding target (a JSON schema, grammar, regex
+            pattern, or choice list, depending on structure_type).
+        structure_type: The kind of structured output to request.
+        completion: The reference completion from the dataset, if any.
+    """
+ prompt: str
+ prompt_len: int
+ expected_output_len: int
+ schema: dict
+ structure_type: str = 'json'
+ completion: str = None
+
+
+def run_vllm(requests: List[SampleRequest],
+ engine_args: EngineArgs,
+ n: int,
+ guided_decoding_rate: float = 1.0,
+ warmup: bool = False) -> float:
+ from vllm import LLM, SamplingParams
+ llm = LLM(**vars(engine_args))
+
+ # Add the requests to the engine.
+ prompts: List[str] = []
+ sampling_params: List[SamplingParams] = []
+ # create a list containing random selected true or false
+ guided_decoding_req_idx = random.sample(
+ range(len(requests)), int(len(requests) * guided_decoding_rate))
+
+ if warmup:
+        print(">>>>> Running warmup with the first 5 prompts")
+        # Use the first 5 requests to warm up the FSM cache.
+        # Warmup is skipped for the xgrammar_bench dataset.
+ warmup_requests = requests[:5]
+ for i, request in enumerate(warmup_requests):
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ guided_decoding=GuidedDecodingParams(json=request.schema)
+ if guided_decoding_rate > 0 else None,
+ ))
+ llm.generate(prompts, sampling_params, use_tqdm=False)
+
+ print(">>>>> Benchmark started...")
+ prompts = []
+ sampling_params = []
+ for i, request in enumerate(requests):
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ guided_decoding=GuidedDecodingParams(
+ **{request.structure_type: request.schema})
+ if i in guided_decoding_req_idx else None,
+ ))
+
+ start = time.perf_counter()
+ outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
+ ret = []
+ for output, request in zip(outputs, requests):
+ generated_text = output.outputs[0].text
+ ret.append({
+ "generated": generated_text,
+ "expected": request.completion
+ })
+ end = time.perf_counter()
+ return end - start, ret
+
+
+async def run_vllm_async(
+ requests: List[SampleRequest],
+ engine_args: AsyncEngineArgs,
+ n: int,
+ guided_decoding_rate: float = 1.0,
+ warmup: bool = False,
+ disable_frontend_multiprocessing: bool = False) -> float:
+ from vllm import SamplingParams
+
+ async with build_async_engine_client_from_engine_args(
+ engine_args, disable_frontend_multiprocessing) as llm:
+
+ # Add the requests to the engine.
+ prompts: List[str] = []
+ sampling_params: List[SamplingParams] = []
+ guided_decoding_req_idx = random.sample(
+ range(len(requests)), int(len(requests) * guided_decoding_rate))
+
+ if warmup:
+            print(">>>>>> Running warmup with the first 5 prompts")
+            # Use the first 5 requests to warm up the FSM cache.
+            # Warmup is skipped for the xgrammar_bench dataset.
+ warmup_requests = requests[:5]
+ for i, request in enumerate(warmup_requests):
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ guided_decoding=GuidedDecodingParams(
+ json=request.schema)
+ if guided_decoding_rate > 0 else None,
+ ))
+ generators = []
+ for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+ generator = llm.generate(prompt, sp, request_id=f"test{i}")
+ generators.append(generator)
+ all_gens = merge_async_iterators(*generators)
+ async for i, res in all_gens:
+ pass
+
+ print(">>>>> Benchmark started...")
+ prompts = []
+ sampling_params = []
+ for i, request in enumerate(requests):
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ guided_decoding=GuidedDecodingParams(json=request.schema)
+ if i in guided_decoding_req_idx else None,
+ ))
+
+ generators = []
+ start_time = []
+ latencies = []
+ start = time.perf_counter()
+ for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+ generator = llm.generate(prompt, sp, request_id=f"test{i}")
+ generators.append(generator)
+ start_time.append(time.perf_counter())
+ latencies.append([])
+ all_gens = merge_async_iterators(*generators)
+ generated_texts = [''] * len(requests)
+ async for i, res in all_gens:
+ generated_texts[i] = res.outputs[0].text
+ lat = time.perf_counter() - start_time[i]
+ latencies[i].append(lat)
+ ret = [{
+ 'generated': gt,
+ 'expected': req.completion
+ } for gt, req in zip(generated_texts, requests)]
+ end = time.perf_counter()
+ first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
+ next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
+ for lat in latencies])
+ return end - start, ret, (first_latency, next_latency)
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+ args: argparse.Namespace) -> List[SampleRequest]:
+ if args.dataset == 'json':
+ if args.json_schema_path is None:
+ dir_path = os.path.dirname(os.path.realpath(__file__))
+ args.json_schema_path = os.path.join(dir_path,
+ "structured_schemas",
+ "structured_schema_1.json")
+ with open(args.json_schema_path) as f:
+ schema = json.load(f)
+ prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "grammar":
+ schema = """
+ ?start: select_statement
+
+ ?select_statement: "SELECT " column_list " FROM " table_name
+
+ ?column_list: column_name ("," column_name)*
+
+ ?table_name: identifier
+
+ ?column_name: identifier
+
+ ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+ """
+ prompt = "Generate an SQL query to show the 'username' \
+ and 'email' from the 'users' table."
+
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "regex":
+ regex = r"\w+@\w+\.com\n"
+ args.regex = regex
+ prompt = "Generate an email address for Alan Turing, \
+ who works in Enigma. End in .com and new line. \
+ Example result: alan.turing@enigma.com\n"
+
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=regex,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "choice":
+ choice = ["Positive", "Negative"]
+ args.choice = choice
+ prompt = "Classify this sentiment: vLLM is wonderful!"
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=choice,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "xgrammar_bench":
+ args.warmup = False
+ requests: List[SampleRequest] = []
+ dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+ split="train")
+ print(f"dataset has {len(dataset)} entries")
+ len_dataset = len(dataset)
+ for data_point_idx in range(args.num_prompts):
+ idx = data_point_idx
+ while idx >= len_dataset:
+ idx -= len_dataset
+ schema = dataset["schema"][idx]
+ prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+ tokenize=False)
+ input_len = len(tokenizer(prompt).input_ids)
+ completion = dataset["completion"][idx]
+
+ requests.append(
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ completion=completion))
+
+ return requests
+
+
+def evaluate(ret, args):
+
+ def _eval_correctness_json(expected, actual):
+ # extract json string from string using regex
+ import re
+ actual = actual.replace('\n', '').replace(' ', '').strip()
+ try:
+ actual = re.search(r'\{.*\}', actual).group()
+ actual = json.loads(actual)
+ except Exception:
+ return False
+
+ return True
+
+ def _eval_correctness_choice(expected, actual):
+ return actual in args.choice
+
+ def _eval_correctness_regex(expected, actual):
+ import re
+ return re.match(args.regex, actual) is not None
+
+ def _eval_correctness(expected, actual):
+ if args.structure_type == 'json':
+ return _eval_correctness_json(expected, actual)
+ elif args.structure_type == 'regex':
+ return _eval_correctness_regex(expected, actual)
+ elif args.structure_type == 'choice':
+ return _eval_correctness_choice(expected, actual)
+ else:
+ return None
+
+ scores = []
+ for res in ret:
+ score = _eval_correctness(res['expected'], res['generated'])
+ res['correctness'] = score
+ scores.append(score)
+
+ not_none_scores = [score for score in scores if score is not None]
+
+ return (sum(not_none_scores) / len(not_none_scores) *
+ 100) if len(not_none_scores) > 0 else None
+
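
To make the JSON correctness check above concrete, the same extract-then-parse logic applied to a hypothetical model response:

    import json
    import re

    actual = ('Sure! Here is the profile:\n'
              '{ "name": "Alan", "email": "alan@enigma.com" }')
    cleaned = actual.replace('\n', '').replace(' ', '').strip()
    match = re.search(r'\{.*\}', cleaned)  # grab the outermost {...} span
    parsed = json.loads(match.group())     # a parse failure counts as incorrect
    print(parsed)  # {'name': 'Alan', 'email': 'alan@enigma.com'}
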
+
+def main(args: argparse.Namespace):
+ print(args)
+ random.seed(args.seed)
+
+ # async engine is working for 'regex', 'choice' and 'grammar'
+ if args.dataset == 'grammar':
+ args.structure_type = 'grammar'
+ args.async_engine = False
+ elif args.dataset == 'regex':
+ args.structure_type = 'regex'
+ args.async_engine = False
+ elif args.dataset == 'choice':
+ args.structure_type = 'choice'
+ args.async_engine = False
+ else:
+ args.structure_type = 'json'
+
+ if args.no_guided_decoding:
+ args.guided_decoding_ratio = 0
+ if args.save_results:
+ result_file_name = f'{args.guided_decoding_ratio}guided'
+ result_file_name += f"_{args.model.split('/')[-1]}"
+ result_file_name += f"_{args.dataset}"
+ result_file_name += f"_{args.num_prompts}"
+ result_file_name += f"_out{args.output_len}"
+ result_file_name += f"_async{args.async_engine}"
+ result_file_name += f"_warmup{args.warmup}"
+ result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
+ result_file_name += ".txt"
+ else:
+ result_file_name = None
+
+ # Synthesize a prompt with the given input length.
+ tokenizer = AutoTokenizer.from_pretrained(
+ args.tokenizer, trust_remote_code=args.trust_remote_code)
+ requests = sample_requests(tokenizer, args)
+
+ if args.async_engine:
+ engine_args = AsyncEngineArgs.from_cli_args(args)
+ elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
+ run_vllm_async(requests, engine_args, args.n,
+ args.guided_decoding_ratio, args.warmup,
+ args.disable_frontend_multiprocessing))
+ else:
+ engine_args = EngineArgs.from_cli_args(args)
+ elapsed_time, ret = run_vllm(requests, engine_args, args.n,
+ args.guided_decoding_ratio, args.warmup)
+ first_latency, next_latency = None, None
+
+ score = evaluate(ret, args)
+ total_num_tokens = sum(request.prompt_len + request.expected_output_len
+ for request in requests)
+ total_output_tokens = sum(request.expected_output_len
+ for request in requests)
+ if first_latency is not None:
+ latency_breakdown = "\nFirst token latency(msecs):\n"
+ latency_breakdown += f"{first_latency.describe()}"
+ latency_breakdown += "\nNext token latency(msecs):\n"
+ latency_breakdown += f"{next_latency.describe()}"
+ print(
+ f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+ f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+ f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
+ f"Correct rate is {score} %",
+ f"{latency_breakdown if first_latency is not None else ''}")
+
+ # Output JSON results if specified
+ if args.output_json or result_file_name:
+ results = {
+ "elapsed_time": elapsed_time,
+ "num_requests": len(requests),
+ "total_num_tokens": total_num_tokens,
+ "total_output_tokens": total_output_tokens,
+ "requests_per_second": len(requests) / elapsed_time,
+ "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
+ "output_tokens_per_second":
+ f"{total_output_tokens / elapsed_time:.2f}",
+ "correct_rate(%)": score
+ }
+ results = {"outputs": ret, **results}
+ if first_latency is not None:
+ results["first_token_latency(msecs)"] = first_latency.describe(
+ ).to_dict()
+ results["next_token_latency(msecs)"] = next_latency.describe(
+ ).to_dict()
+ if args.output_json:
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+ elif result_file_name:
+ with open(result_file_name, "w") as f:
+ json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
+ parser = AsyncEngineArgs.add_cli_args(parser)
+
+ parser.add_argument("--output-len",
+ type=int,
+ default=512,
+ help="Output length for each request. Overrides the "
+ "output length from the dataset.")
+ parser.add_argument(
+ "--dataset",
+ default='json',
+ choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+ parser.add_argument("--json_schema_path",
+ type=str,
+ default=None,
+ help="Path to json schema.")
+ parser.add_argument("--n",
+ type=int,
+ default=1,
+ help="Number of generated sequences per prompt.")
+ parser.add_argument("--num-prompts",
+ type=int,
+ default=10,
+ help="Number of prompts to process.")
+ parser.add_argument(
+ '--output-json',
+ type=str,
+ default=None,
+ help='Path to save the throughput results in JSON format.')
+ parser.add_argument("--async-engine",
+ action='store_true',
+ default=False,
+ help="Use vLLM async engine rather than LLM class.")
+ parser.add_argument("--no-guided-decoding",
+ action='store_true',
+ default=False,
+                        help="Whether to disable guided decoding.")
+ parser.add_argument("--guided-decoding-ratio",
+ type=float,
+ default=1.0,
+                        help="Ratio of requests that use guided decoding.")
+ parser.add_argument("--disable-frontend-multiprocessing",
+ action='store_true',
+ default=False,
+ help="Disable decoupled async engine frontend.")
+ parser.add_argument("--warmup",
+ action="store_true",
+ default=False,
+ help="Run warmup prompts before benchmark.")
+ parser.add_argument("--save-results",
+ action="store_true",
+ default=False,
+                        help="Save benchmark output results to a file.")
+ args = parser.parse_args()
+ if args.tokenizer is None:
+ args.tokenizer = args.model
+ main(args)
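
For a quick sense of what each benchmarked request does, a minimal offline sketch using the same vLLM APIs the script imports (LLM, SamplingParams, GuidedDecodingParams); the model id and schema are placeholders, not part of the diff.

    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    schema = {"type": "object",
              "properties": {"name": {"type": "string"}},
              "required": ["name"]}

    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model id
    params = SamplingParams(
        temperature=1.0,
        max_tokens=128,
        # Constrain decoding to the JSON schema, as run_vllm() does above.
        guided_decoding=GuidedDecodingParams(json=schema),
    )
    outputs = llm.generate(["Generate a user profile as JSON."], params)
    print(outputs[0].outputs[0].text)
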
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 6d33096ca1d11..5e9381f712e10 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -54,13 +54,30 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
print(f"cost time {end_time - start_time}")
-def sample_requests(
+@dataclasses.dataclass
+class Request:
+ prompt: str
+ prompt_len: int
+ output_len: int
+
+
+def sample_tokens(tokenizer: PreTrainedTokenizerBase,
+                  length: int) -> List[int]:
+    vocab = tokenizer.get_vocab()
+    # Remove the special tokens (compare token IDs, not token strings).
+    vocab = {
+        k: v
+        for k, v in vocab.items() if v not in tokenizer.all_special_ids
+    }
+    return random.choices(list(vocab.values()), k=length)
+
+
+def sample_requests_from_dataset(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> List[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@@ -77,31 +94,55 @@ def sample_requests(
random.shuffle(dataset)
min_len, max_len = input_length_range
+ assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
# Filter out sequences that are too long or too short
- filtered_dataset: List[Tuple[str, int, int]] = []
+ filtered_requests: List[Request] = []
+
for i in range(len(dataset)):
- if len(filtered_dataset) == num_requests:
+ if len(filtered_requests) == num_requests:
break
# Tokenize the prompts and completions.
- prompt = dataset[i][0]
- prompt_token_ids = tokenizer(prompt).input_ids
+ prompt_token_ids = tokenizer(dataset[i][0]).input_ids
+ prompt = tokenizer.decode(prompt_token_ids)
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
- output_len = len(completion_token_ids
- ) if fixed_output_len is None else fixed_output_len
- if prompt_len < 4 or output_len < 4:
- # Prune too short sequences.
- continue
+ output_len = (len(completion_token_ids)
+ if fixed_output_len is None else fixed_output_len)
if min_len <= prompt_len <= max_len:
- filtered_dataset.append((prompt, prompt_len, output_len))
+ filtered_requests.append(Request(prompt, prompt_len, output_len))
+
+ return filtered_requests
+
+
+def sample_requests_from_random(
+ num_requests: int,
+ tokenizer: PreTrainedTokenizerBase,
+ input_length_range: Tuple[int, int],
+ fixed_output_len: Optional[int],
+ prefix_len: int,
+) -> List[Request]:
- return filtered_dataset
+ requests = []
+ prefix_token_ids = sample_tokens(tokenizer, prefix_len)
+ min_len, max_len = input_length_range
+
+ for i in range(num_requests):
+ unique_part_token_ids = sample_tokens(
+ tokenizer,
+ random.randint(min_len - prefix_len, max_len - prefix_len))
+ prompt_token_ids = prefix_token_ids + unique_part_token_ids
+ prompt = tokenizer.decode(prompt_token_ids)
+ prompt_len = len(prompt_token_ids)
+ assert (min_len <= prompt_len <= max_len
+                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+ requests.append(Request(prompt, prompt_len, fixed_output_len))
+ return requests
-def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
+def repeat_and_sort_requests(requests: List[Request],
repeat_count: int,
sort: bool = False) -> List[str]:
repeated_requests = requests * repeat_count
@@ -109,7 +150,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
repeated_requests.sort(key=lambda x: x[1])
else:
random.shuffle(repeated_requests)
- return [req[0] for req in repeated_requests]
+ return [req.prompt for req in repeated_requests]
def main(args):
@@ -117,9 +158,12 @@ def main(args):
input_length_range = tuple(map(int, args.input_length_range.split(':')))
random.seed(args.seed)
if args.dataset_path is not None:
- print(f"Start to sample {args.num_prompts} prompts"
+ if args.prefix_len > 0:
+ raise ValueError("prefix-len is not supported when "
+ "dataset-path is provided.")
+ print(f"Start to sample {args.num_prompts} prompts "
f"from {args.dataset_path}")
- filtered_datasets = sample_requests(
+ filtered_requests = sample_requests_from_dataset(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
@@ -127,9 +171,22 @@ def main(args):
fixed_output_len=args.output_len,
)
else:
- prompt_len = len(tokenizer(PROMPT).input_ids)
- filtered_datasets = [(PROMPT, prompt_len, args.output_len)
- ] * args.num_prompts
+ print(f"Start to sample {args.num_prompts} prompts from random")
+ filtered_requests = sample_requests_from_random(
+ num_requests=args.num_prompts,
+ tokenizer=tokenizer,
+ input_length_range=input_length_range,
+ fixed_output_len=args.output_len,
+ prefix_len=args.prefix_len,
+ )
+
+ # Print some helpful stats of the requests.
+ print(f"Sampled {len(filtered_requests)} requests.")
+ prompt_lens = [req.prompt_len for req in filtered_requests]
+ print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}")
+ print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}")
+ print(f"Min Prompt Length: {min(prompt_lens)}")
+ print(f"Max Prompt Length: {max(prompt_lens)}")
engine_args = EngineArgs.from_cli_args(args)
@@ -137,8 +194,8 @@ def main(args):
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
- print("Testing filtered datasets")
- prompts = repeat_and_sort_requests(filtered_datasets,
+ print("Testing filtered requests")
+ prompts = repeat_and_sort_requests(filtered_requests,
repeat_count=args.repeat_count,
sort=args.sort)
@@ -161,20 +218,29 @@ def main(args):
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--num-prompts',
type=int,
- default=1,
+ required=True,
help="Number of the prompts sampled from dataset")
parser.add_argument('--repeat-count',
type=int,
- default=100,
+ default=1,
help='Number of times to repeat each prompt')
parser.add_argument('--sort',
action='store_true',
help='Sort prompts by input length')
parser.add_argument('--input-length-range',
type=str,
- default='128:256',
+ required=True,
help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").')
+ parser.add_argument(
+ "--prefix-len",
+ type=int,
+ default=0,
+ help="Specifies the length of a common prefix to be "
+ "added to the input prompt. The input-length-range will "
+ "subtract this length when filtering prompts. Only used "
+ "when dataset-path is not provided.",
+ )
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
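
To illustrate the new random-sampling path: every generated prompt shares a --prefix-len token prefix, and the unique tail is sized so the total length stays inside --input-length-range. A rough standalone sketch of that arithmetic, using a placeholder tokenizer:

    import random

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
    min_len, max_len, prefix_len = 128, 256, 64

    vocab_ids = list(tokenizer.get_vocab().values())
    prefix = random.choices(vocab_ids, k=prefix_len)   # shared by all prompts
    for _ in range(3):
        tail_len = random.randint(min_len - prefix_len, max_len - prefix_len)
        prompt_ids = prefix + random.choices(vocab_ids, k=tail_len)
        assert min_len <= len(prompt_ids) <= max_len
        print(len(prompt_ids), repr(tokenizer.decode(prompt_ids)[:40]))
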
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index bdb8ea8e2a5dc..4eb0e1f8ac903 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -199,6 +199,56 @@ def sample_sonnet_requests(
return sampled_requests
+def sample_mmmu_pro_vision_requests(
+ dataset,
+ num_requests: int,
+ tokenizer: PreTrainedTokenizerBase,
+ fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+ sampled_requests: List[Tuple[str, int, int, Dict[str,
+ Collection[str]]]] = []
+ for data in dataset:
+ if len(sampled_requests) == num_requests:
+ break
+
+ # MMMU-Pro vision direct prompt
+ # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
+ prompt = (
+ "Answer with the option letter from the given choices directly. "
+ "The last line of your response should be of the following "
+ "format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
+ "options.")
+
+ prompt_token_ids = tokenizer(prompt).input_ids
+ if fixed_output_len is None:
+ # Default max output len is set to 128
+ print("--hf-output-len is not provided. Using default value 128.")
+ fixed_output_len = 128
+
+ prompt_len = len(prompt_token_ids)
+ output_len = fixed_output_len
+
+ assert isinstance(
+ data["image"],
+ Image), ("Input image format must be `PIL.Image.Image`, "
+ f"given {type(data['image'])}.")
+ image: Image = data["image"]
+ image = image.convert("RGB")
+ image_data = io.BytesIO()
+ image.save(image_data, format='JPEG')
+ image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
+ mm_content = {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{image_base64}"
+ },
+ }
+
+ sampled_requests.append((prompt, prompt_len, output_len, mm_content))
+
+ return sampled_requests
+
+
def sample_hf_requests(
dataset_path: str,
dataset_subset: str,
@@ -208,6 +258,21 @@ def sample_hf_requests(
random_seed: int,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+
+ # Special case for MMMU-Pro vision dataset
+ if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
+ assert dataset_split == "test"
+ dataset = load_dataset(dataset_path,
+ name=dataset_subset,
+ split=dataset_split,
+ streaming=True)
+ assert "image" in dataset.features, (
+ "MMMU/MMMU_Pro vision dataset must have 'image' column.")
+ filter_func = lambda x: isinstance(x["image"], Image)
+ dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
+ return sample_mmmu_pro_vision_requests(dataset, num_requests,
+ tokenizer, fixed_output_len)
+
dataset = load_dataset(dataset_path,
name=dataset_subset,
split=dataset_split,
@@ -251,6 +316,19 @@ def sample_hf_requests(
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
+ elif "image" in data and isinstance(data["image"], str):
+ if (data["image"].startswith("http://") or \
+ data["image"].startswith("file://")):
+ image_url = data["image"]
+ else:
+ image_url = f"file://{data['image']}"
+
+ mm_content = {
+ "type": "image_url",
+ "image_url": {
+ "url": image_url
+ },
+ }
else:
mm_content = None
@@ -703,6 +781,7 @@ def main(args: argparse.Namespace):
backend = args.backend
model_id = args.model
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+ tokenizer_mode = args.tokenizer_mode
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
@@ -712,6 +791,7 @@ def main(args: argparse.Namespace):
base_url = f"http://{args.host}:{args.port}"
tokenizer = get_tokenizer(tokenizer_id,
+ tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code)
if args.dataset is not None:
@@ -1132,5 +1212,15 @@ def main(args: argparse.Namespace):
"from the sampled HF dataset.",
)
+ parser.add_argument(
+ '--tokenizer-mode',
+ type=str,
+ default="auto",
+ choices=['auto', 'slow', 'mistral'],
+ help='The tokenizer mode.\n\n* "auto" will use the '
+ 'fast tokenizer if available.\n* "slow" will '
+ 'always use the slow tokenizer. \n* '
+ '"mistral" will always use the `mistral_common` tokenizer.')
+
args = parser.parse_args()
main(args)
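
The MMMU-Pro vision path above ships each image to the server as a base64 data URL inside OpenAI-style image_url content. A standalone sketch of that encoding step; the image path is hypothetical:

    import base64
    import io

    from PIL import Image

    image = Image.open("example.jpg").convert("RGB")  # hypothetical local file
    buf = io.BytesIO()
    image.save(buf, format="JPEG")
    image_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    mm_content = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
    }
    print(mm_content["image_url"]["url"][:48] + "...")
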
diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py
new file mode 100644
index 0000000000000..4435d87e18a8a
--- /dev/null
+++ b/benchmarks/benchmark_serving_guided.py
@@ -0,0 +1,881 @@
+r"""Benchmark online serving throughput with guided decoding.
+
+On the server side, run one of the following commands:
+    (vLLM OpenAI API server)
+    vllm serve <your_model> --disable-log-requests
+
+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
+On the client side, run:
+    python benchmarks/benchmark_serving_guided.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset json \
+        --guided-decoding-ratio 1.0 \
+        --guided-decoding-backend xgrammar \
+        --request-rate 10 \
+        --num-prompts 1000
+
+ when using tgi backend, add
+ --endpoint /generate_stream
+ to the end of the command above.
+"""
+import argparse
+import asyncio
+import dataclasses
+import json
+import os
+import random
+import time
+import warnings
+from dataclasses import dataclass
+from typing import AsyncGenerator, List, Optional, Tuple
+
+import datasets
+import numpy as np
+import pandas as pd
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+ RequestFuncOutput)
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+try:
+ from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+ from backend_request_func import get_tokenizer
+
+try:
+ from vllm.utils import FlexibleArgumentParser
+except ImportError:
+ from argparse import ArgumentParser as FlexibleArgumentParser
+
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
+
+@dataclass
+class BenchmarkMetrics:
+ completed: int
+ total_input: int
+ total_output: int
+ request_throughput: float
+ request_goodput: float
+ output_throughput: float
+ total_token_throughput: float
+ mean_ttft_ms: float
+ median_ttft_ms: float
+ std_ttft_ms: float
+ percentiles_ttft_ms: List[Tuple[float, float]]
+ mean_tpot_ms: float
+ median_tpot_ms: float
+ std_tpot_ms: float
+ percentiles_tpot_ms: List[Tuple[float, float]]
+ mean_itl_ms: float
+ median_itl_ms: float
+ std_itl_ms: float
+ percentiles_itl_ms: List[Tuple[float, float]]
+ # E2EL stands for end-to-end latency per request.
+ # It is the time taken on the client side from sending
+ # a request to receiving a complete response.
+ mean_e2el_ms: float
+ median_e2el_ms: float
+ std_e2el_ms: float
+ percentiles_e2el_ms: List[Tuple[float, float]]
+
+
+@dataclasses.dataclass
+class SampleRequest:
+ """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+        schema: The guided-decoding target (a JSON schema, grammar, regex
+            pattern, or choice list, depending on structure_type).
+        structure_type: The kind of structured output to request.
+        completion: The reference completion from the dataset, if any.
+    """
+ prompt: str
+ prompt_len: int
+ expected_output_len: int
+ schema: dict
+ structure_type: str
+ completion: str = None
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+ args: argparse.Namespace) -> List[SampleRequest]:
+ if args.dataset == 'json':
+ if args.json_schema_path is None:
+ dir_path = os.path.dirname(os.path.realpath(__file__))
+ args.json_schema_path = os.path.join(dir_path,
+ "structured_schemas",
+ "structured_schema_1.json")
+ with open(args.json_schema_path) as f:
+ schema = json.load(f)
+ prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "grammar":
+ schema = """
+ ?start: select_statement
+
+ ?select_statement: "SELECT " column_list " FROM " table_name
+
+ ?column_list: column_name ("," column_name)*
+
+ ?table_name: identifier
+
+ ?column_name: identifier
+
+ ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+ """
+ prompt = "Generate an SQL query to show the 'username' \
+ and 'email' from the 'users' table."
+
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "regex":
+ regex = r"\w+@\w+\.com\n"
+ args.regex = regex
+ prompt = "Generate an email address for Alan Turing, \
+ who works in Enigma. End in .com and new line. \
+ Example result: alan.turing@enigma.com\n"
+
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=regex,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "choice":
+ choice = ["Positive", "Negative"]
+ args.choice = choice
+ prompt = "Classify this sentiment: vLLM is wonderful!"
+ input_len = len(tokenizer(prompt).input_ids)
+ print(f"Input length of the prompt: {input_len} tokens")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=choice,
+ structure_type=args.structure_type)
+ for _ in range(args.num_prompts)
+ ]
+
+ elif args.dataset == "xgrammar_bench":
+ requests: List[SampleRequest] = []
+ dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+ split="train")
+ print(f"dataset has {len(dataset)} entries")
+ len_dataset = len(dataset)
+ for data_point_idx in range(args.num_prompts):
+ idx = data_point_idx
+ while idx >= len_dataset:
+ idx -= len_dataset
+ schema = dataset["schema"][idx]
+ prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+ tokenize=False)
+ input_len = len(tokenizer(prompt).input_ids)
+ completion = dataset["completion"][idx]
+
+ requests.append(
+ SampleRequest(prompt=prompt,
+ prompt_len=input_len,
+ expected_output_len=args.output_len,
+ schema=schema,
+ structure_type=args.structure_type,
+ completion=completion))
+
+ return requests
+
+
+async def get_request(
+ input_requests: List[SampleRequest],
+ request_rate: float,
+ burstiness: float = 1.0,
+) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
+ """
+ Asynchronously generates requests at a specified rate
+ with OPTIONAL burstiness.
+
+ Args:
+ input_requests:
+ A list of input requests, each represented as a tuple.
+ request_rate:
+ The rate at which requests are generated (requests/s).
+ burstiness (optional):
+ The burstiness factor of the request generation.
+ Only takes effect when request_rate is not inf.
+ Default value is 1, which follows a Poisson process.
+ Otherwise, the request intervals follow a gamma distribution.
+ A lower burstiness value (0 < burstiness < 1) results
+ in more bursty requests, while a higher burstiness value
+ (burstiness > 1) results in a more uniform arrival of requests.
+ """
+ input_requests = iter(input_requests)
+
+ # Calculate scale parameter theta to maintain the desired request_rate.
+ assert burstiness > 0, (
+ f"A positive burstiness factor is expected, but given {burstiness}.")
+ theta = 1.0 / (request_rate * burstiness)
+
+ for i, request in enumerate(input_requests):
+ yield i, request
+
+ if request_rate == float("inf"):
+ # If the request rate is infinity, then we don't need to wait.
+ continue
+
+ # Sample the request interval from the gamma distribution.
+ # If burstiness is 1, it follows exponential distribution.
+ interval = np.random.gamma(shape=burstiness, scale=theta)
+ # The next request will be sent after the interval.
+ await asyncio.sleep(interval)
+
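
A small numeric check of the interval sampling documented above: with shape=burstiness and scale=1/(request_rate*burstiness), the mean interval is always 1/request_rate, while lower burstiness gives higher variance (burstier arrivals) and burstiness=1 reduces to a Poisson process. The values below are illustrative only:

    import numpy as np

    request_rate = 10.0  # requests/s
    for burstiness in (0.5, 1.0, 2.0):
        theta = 1.0 / (request_rate * burstiness)
        intervals = np.random.gamma(shape=burstiness, scale=theta, size=100_000)
        # Mean stays ~0.1 s; std shrinks as burstiness grows.
        print(burstiness, round(intervals.mean(), 3), round(intervals.std(), 3))
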
+
+def calculate_metrics(
+ input_requests: List[Tuple[str, int, int]],
+ outputs: List[RequestFuncOutput],
+ dur_s: float,
+ tokenizer: PreTrainedTokenizerBase,
+ selected_percentile_metrics: List[str],
+ selected_percentiles: List[float],
+) -> Tuple[BenchmarkMetrics, List[int]]:
+ actual_output_lens: List[int] = []
+ total_input = 0
+ completed = 0
+ good_completed = 0
+ itls: List[float] = []
+ tpots: List[float] = []
+ all_tpots: List[float] = []
+ ttfts: List[float] = []
+ e2els: List[float] = []
+ for i in range(len(outputs)):
+ if outputs[i].success:
+ # We use the tokenizer to count the number of output tokens for all
+ # serving backends instead of looking at len(outputs[i].itl) since
+ # multiple output tokens may be bundled together
+ # Note : this may inflate the output token count slightly
+ output_len = len(
+ tokenizer(outputs[i].generated_text,
+ add_special_tokens=False).input_ids)
+ actual_output_lens.append(output_len)
+ total_input += input_requests[i].prompt_len
+ tpot = 0
+ if output_len > 1:
+ tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
+ 1)
+ tpots.append(tpot)
+ outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
+ # Note: if output_len <= 1, we regard tpot as 0 for goodput
+ all_tpots.append(tpot)
+ itls += outputs[i].itl
+ ttfts.append(outputs[i].ttft)
+ e2els.append(outputs[i].latency)
+ completed += 1
+ else:
+ actual_output_lens.append(0)
+
+ if completed == 0:
+ warnings.warn(
+ "All requests failed. This is likely due to a misconfiguration "
+ "on the benchmark arguments.",
+ stacklevel=2)
+ metrics = BenchmarkMetrics(
+ completed=completed,
+ total_input=total_input,
+ total_output=sum(actual_output_lens),
+ request_throughput=completed / dur_s,
+ request_goodput=good_completed / dur_s,
+ output_throughput=sum(actual_output_lens) / dur_s,
+ total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
+ mean_ttft_ms=np.mean(ttfts or 0) *
+ 1000, # ttfts is empty if streaming is not supported by backend
+ std_ttft_ms=np.std(ttfts or 0) * 1000,
+ median_ttft_ms=np.median(ttfts or 0) * 1000,
+ percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
+ for p in selected_percentiles],
+ mean_tpot_ms=np.mean(tpots or 0) * 1000,
+ std_tpot_ms=np.std(tpots or 0) * 1000,
+ median_tpot_ms=np.median(tpots or 0) * 1000,
+ percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
+ for p in selected_percentiles],
+ mean_itl_ms=np.mean(itls or 0) * 1000,
+ std_itl_ms=np.std(itls or 0) * 1000,
+ median_itl_ms=np.median(itls or 0) * 1000,
+ percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
+ for p in selected_percentiles],
+ mean_e2el_ms=np.mean(e2els or 0) * 1000,
+ std_e2el_ms=np.std(e2els or 0) * 1000,
+ median_e2el_ms=np.median(e2els or 0) * 1000,
+ percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
+ for p in selected_percentiles],
+ )
+
+ return metrics, actual_output_lens
+
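
A worked instance of the TPOT formula used in calculate_metrics above (all numbers hypothetical): the first token's latency is excluded, and the remaining latency is averaged over the remaining tokens.

    latency, ttft, output_len = 2.0, 0.25, 8    # seconds, seconds, tokens
    tpot = (latency - ttft) / (output_len - 1)  # time per output token
    print(f"TPOT: {tpot * 1000:.1f} ms")        # -> TPOT: 250.0 ms
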
+
+async def benchmark(
+ backend: str,
+ api_url: str,
+ base_url: str,
+ model_id: str,
+ tokenizer: PreTrainedTokenizerBase,
+ input_requests: List[SampleRequest],
+ request_rate: float,
+ burstiness: float,
+ disable_tqdm: bool,
+ profile: bool,
+ selected_percentile_metrics: List[str],
+ selected_percentiles: List[str],
+ ignore_eos: bool,
+ max_concurrency: Optional[int],
+ guided_decoding_ratio: float,
+ guided_decoding_backend: str,
+):
+ if backend in ASYNC_REQUEST_FUNCS:
+ request_func = ASYNC_REQUEST_FUNCS[backend]
+ else:
+ raise ValueError(f"Unknown backend: {backend}")
+
+ def prepare_extra_body(request) -> dict:
+ extra_body = {}
+ # Add the schema to the extra_body
+ extra_body[request.structure_type] = request.schema
+ # Add the specific guided_decoding_backend
+ extra_body["guided_decoding_backend"] = guided_decoding_backend
+ return extra_body
+
+ print("Starting initial single prompt test run...")
+ guided_decoding_req_idx = random.sample(
+ range(len(input_requests)),
+ int(len(input_requests) * guided_decoding_ratio))
+
+ test_request = input_requests[0]
+ test_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_request.prompt,
+ api_url=api_url,
+ prompt_len=test_request.prompt_len,
+ output_len=test_request.expected_output_len,
+ ignore_eos=ignore_eos,
+ extra_body=prepare_extra_body(test_request),
+ )
+ test_output = await request_func(request_func_input=test_input)
+ if not test_output.success:
+ raise ValueError(
+ "Initial test run failed - Please make sure benchmark arguments "
+ f"are correctly specified. Error: {test_output.error}")
+ else:
+ print("Initial test run completed. Starting main benchmark run...")
+
+ if profile:
+ print("Starting profiler...")
+ profile_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_request.prompt,
+ api_url=base_url + "/start_profile",
+ prompt_len=test_request.prompt_len,
+ output_len=test_request.expected_output_len,
+ ignore_eos=ignore_eos,
+ extra_body=prepare_extra_body(test_request),
+ )
+ profile_output = await request_func(request_func_input=profile_input)
+ if profile_output.success:
+ print("Profiler started")
+
+ if burstiness == 1.0:
+ distribution = "Poisson process"
+ else:
+ distribution = "Gamma distribution"
+
+ print(f"Traffic request rate: {request_rate}")
+ print(f"Burstiness factor: {burstiness} ({distribution})")
+ print(f"Maximum request concurrency: {max_concurrency}")
+
+ pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+ # This can be used once the minimum Python version is 3.10 or higher,
+ # and it will simplify the code in limited_request_func.
+ # semaphore = (asyncio.Semaphore(max_concurrency)
+ # if max_concurrency else contextlib.nullcontext())
+ semaphore = (asyncio.Semaphore(max_concurrency)
+ if max_concurrency else None)
+
+ async def limited_request_func(request_func_input, pbar):
+ if semaphore is None:
+ return await request_func(request_func_input=request_func_input,
+ pbar=pbar)
+ async with semaphore:
+ return await request_func(request_func_input=request_func_input,
+ pbar=pbar)
+
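
The commented-out simplification above relies on contextlib.nullcontext being usable under async with from Python 3.10 onward; a minimal sketch of that form (request_func and its input are placeholders):

    import asyncio
    import contextlib

    async def limited_request_func(request_func, request_func_input,
                                   max_concurrency=None):
        # On Python >= 3.10, nullcontext() also works as an async context
        # manager, so the separate "no semaphore" branch can be dropped.
        semaphore = (asyncio.Semaphore(max_concurrency)
                     if max_concurrency else contextlib.nullcontext())
        async with semaphore:
            return await request_func(request_func_input=request_func_input)
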
+ benchmark_start_time = time.perf_counter()
+ tasks: List[asyncio.Task] = []
+ expected: List[str] = []
+ async for i, request in get_request(input_requests, request_rate,
+ burstiness):
+ extra_body = prepare_extra_body(
+ request) if i in guided_decoding_req_idx else None
+ request_func_input = RequestFuncInput(
+ model=model_id,
+ prompt=request.prompt,
+ api_url=api_url,
+ prompt_len=request.prompt_len,
+ output_len=request.expected_output_len,
+ ignore_eos=ignore_eos,
+ extra_body=extra_body,
+ )
+ expected.append(request.completion)
+ tasks.append(
+ asyncio.create_task(
+ limited_request_func(request_func_input=request_func_input,
+ pbar=pbar)))
+ outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+ if profile:
+ print("Stopping profiler...")
+ profile_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_request.prompt,
+ api_url=base_url + "/stop_profile",
+ prompt_len=test_request.prompt_len,
+ output_len=test_request.expected_output_len,
+ extra_body={test_request.structure_type: test_request.schema},
+ )
+ profile_output = await request_func(request_func_input=profile_input)
+ if profile_output.success:
+ print("Profiler stopped")
+
+ if pbar is not None:
+ pbar.close()
+
+ benchmark_duration = time.perf_counter() - benchmark_start_time
+
+ metrics, actual_output_lens = calculate_metrics(
+ input_requests=input_requests,
+ outputs=outputs,
+ dur_s=benchmark_duration,
+ tokenizer=tokenizer,
+ selected_percentile_metrics=selected_percentile_metrics,
+ selected_percentiles=selected_percentiles,
+ )
+
+ print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+ print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+ print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+ benchmark_duration))
+ print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+ print("{:<40} {:<10}".format("Total generated tokens:",
+ metrics.total_output))
+ print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+ metrics.request_throughput))
+ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+ metrics.output_throughput))
+ print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
+ metrics.total_token_throughput))
+
+ result = {
+ "duration":
+ benchmark_duration,
+ "completed":
+ metrics.completed,
+ "total_input_tokens":
+ metrics.total_input,
+ "total_output_tokens":
+ metrics.total_output,
+ "request_throughput":
+ metrics.request_throughput,
+ "output_throughput":
+ metrics.output_throughput,
+ "total_token_throughput":
+ metrics.total_token_throughput,
+ "ttft_description":
+ pd.Series([output.ttft for output in outputs]).describe().to_dict(),
+ "tpot_description":
+ pd.Series([output.tpot for output in outputs]).describe().to_dict(),
+ "input_lens": [output.prompt_len for output in outputs],
+ "output_lens":
+ actual_output_lens,
+ "ttfts": [output.ttft for output in outputs],
+ "itls": [output.itl for output in outputs],
+ "errors": [output.error for output in outputs],
+ }
+
+ ret = [{
+ 'generated': output.generated_text,
+ 'expected': gt
+ } for output, gt in zip(outputs, expected)]
+
+ def process_one_metric(
+ # E.g., "ttft"
+ metric_attribute_name: str,
+ # E.g., "TTFT"
+ metric_name: str,
+ # E.g., "Time to First Token"
+ metric_header: str,
+ ):
+ # This function prints and adds statistics of the specified
+ # metric.
+ if metric_attribute_name not in selected_percentile_metrics:
+ return
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+ print("{:<40} {:<10.2f}".format(
+ f"Mean {metric_name} (ms):",
+ getattr(metrics, f"mean_{metric_attribute_name}_ms")))
+ print("{:<40} {:<10.2f}".format(
+ f"Median {metric_name} (ms):",
+ getattr(metrics, f"median_{metric_attribute_name}_ms")))
+ result[f"mean_{metric_attribute_name}_ms"] = getattr(
+ metrics, f"mean_{metric_attribute_name}_ms")
+ result[f"median_{metric_attribute_name}_ms"] = getattr(
+ metrics, f"median_{metric_attribute_name}_ms")
+ result[f"std_{metric_attribute_name}_ms"] = getattr(
+ metrics, f"std_{metric_attribute_name}_ms")
+ for p, value in getattr(metrics,
+ f"percentiles_{metric_attribute_name}_ms"):
+ p_word = str(int(p)) if int(p) == p else str(p)
+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
+ value))
+ result[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+ process_one_metric("ttft", "TTFT", "Time to First Token")
+ process_one_metric("tpot", "TPOT",
+ "Time per Output Token (excl. 1st token)")
+ process_one_metric("itl", "ITL", "Inter-token Latency")
+ process_one_metric("e2el", "E2EL", "End-to-end Latency")
+
+ print("=" * 50)
+
+ return result, ret
+
+
+def evaluate(ret, args):
+
+ def _eval_correctness_json(expected, actual):
+ # extract json string from string using regex
+ import re
+ actual = actual.replace('\n', '').replace(' ', '').strip()
+ try:
+ actual = re.search(r'\{.*\}', actual).group()
+ actual = json.loads(actual)
+ except Exception:
+ return False
+
+ return True
+
+ def _eval_correctness_choice(expected, actual):
+ return actual in args.choice
+
+ def _eval_correctness_regex(expected, actual):
+ import re
+ return re.match(args.regex, actual) is not None
+
+ def _eval_correctness(expected, actual):
+ if args.structure_type == 'guided_json':
+ return _eval_correctness_json(expected, actual)
+ elif args.structure_type == 'guided_regex':
+ return _eval_correctness_regex(expected, actual)
+ elif args.structure_type == 'guided_choice':
+ return _eval_correctness_choice(expected, actual)
+ else:
+ return None
+
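+    # Illustrative example of the aggregation below: for per-request scores
+    # [True, False, None], the None (unscored) entry is dropped and the
+    # reported correct rate is 1/2 * 100 = 50.0 (%).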
+ scores = []
+ for res in ret:
+ score = _eval_correctness(res['expected'], res['generated'])
+ res['correctness'] = score
+ scores.append(score)
+
+ not_none_scores = [score for score in scores if score is not None]
+
+ return (sum(not_none_scores) / len(not_none_scores) *
+ 100) if len(not_none_scores) > 0 else None
+
+
+def main(args: argparse.Namespace):
+ print(args)
+ random.seed(args.seed)
+ np.random.seed(args.seed)
+
+ backend = args.backend
+ model_id = args.model
+ tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+ if args.base_url is not None:
+ api_url = f"{args.base_url}{args.endpoint}"
+ base_url = f"{args.base_url}"
+ else:
+ api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+ base_url = f"http://{args.host}:{args.port}"
+
+ tokenizer = get_tokenizer(tokenizer_id,
+ trust_remote_code=args.trust_remote_code)
+
+ if args.dataset == 'grammar':
+ args.structure_type = 'guided_grammar'
+ elif args.dataset == 'regex':
+ args.structure_type = 'guided_regex'
+ elif args.dataset == 'choice':
+ args.structure_type = 'guided_choice'
+ else:
+ args.structure_type = 'guided_json'
+
+ if args.no_guided_decoding:
+ args.guided_decoding_ratio = 0
+ if args.save_results:
+ result_file_name = f'{args.guided_decoding_ratio}guided'
+ result_file_name += f"_{backend}"
+ result_file_name += f"_{args.request_rate}qps"
+ result_file_name += f"_{args.model.split('/')[-1]}"
+ result_file_name += f"_{args.dataset}"
+ result_file_name += f"_{args.num_prompts}"
+ result_file_name += f"_out{args.output_len}"
+ result_file_name += ".txt"
+ else:
+ result_file_name = None
+
+ input_requests = sample_requests(tokenizer, args)
+
+ benchmark_result, ret = asyncio.run(
+ benchmark(
+ backend=backend,
+ api_url=api_url,
+ base_url=base_url,
+ model_id=model_id,
+ tokenizer=tokenizer,
+ input_requests=input_requests,
+ request_rate=args.request_rate,
+ burstiness=args.burstiness,
+ disable_tqdm=args.disable_tqdm,
+ profile=args.profile,
+ selected_percentile_metrics=args.percentile_metrics.split(","),
+ selected_percentiles=[
+ float(p) for p in args.metric_percentiles.split(",")
+ ],
+ ignore_eos=args.ignore_eos,
+ max_concurrency=args.max_concurrency,
+ guided_decoding_ratio=args.guided_decoding_ratio,
+ guided_decoding_backend=args.guided_decoding_backend,
+ ))
+
+ # Save config and results to json
+ score = evaluate(ret, args)
+ print("correct_rate(%)", score, '\n')
+ if args.save_results:
+ results = {
+ "backend":
+ backend,
+ "model_id":
+ model_id,
+ "tokenizer_id":
+ tokenizer_id,
+ "num_prompts":
+ args.num_prompts,
+ "request_rate":
+ args.request_rate if args.request_rate < float("inf") else "inf",
+ "burstiness":
+ args.burstiness,
+ "max_concurrency":
+ args.max_concurrency,
+ "correct_rate(%)":
+ score
+ }
+ results = {"outputs": ret, **results, **benchmark_result}
+
+ # Save to file
+ if args.result_filename:
+ result_file_name = args.result_filename
+ if args.result_dir:
+ result_file_name = os.path.join(args.result_dir, result_file_name)
+ with open(result_file_name, "w", encoding='utf-8') as outfile:
+ json.dump(results, outfile, indent=4)
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+ description="Benchmark the online serving throughput.")
+ parser.add_argument(
+ "--backend",
+ type=str,
+ default="vllm",
+ choices=list(ASYNC_REQUEST_FUNCS.keys()),
+ )
+ parser.add_argument(
+ "--base-url",
+ type=str,
+ default=None,
+ help="Server or API base url if not using http host and port.",
+ )
+ parser.add_argument("--host", type=str, default="localhost")
+ parser.add_argument("--port", type=int, default=8000)
+ parser.add_argument(
+ "--endpoint",
+ type=str,
+ default="/v1/completions",
+ help="API endpoint.",
+ )
+ parser.add_argument(
+ "--dataset",
+ default='json',
+ choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+ parser.add_argument("--json_schema_path",
+ type=str,
+ default=None,
+ help="Path to json schema.")
+ parser.add_argument(
+ "--max-concurrency",
+ type=int,
+ default=None,
+ help="Maximum number of concurrent requests. This can be used "
+ "to help simulate an environment where a higher level component "
+ "is enforcing a maximum number of concurrent requests. While the "
+ "--request-rate argument controls the rate at which requests are "
+ "initiated, this argument will control how many are actually allowed "
+ "to execute at a time. This means that when used in combination, the "
+ "actual request rate may be lower than specified with --request-rate, "
+ "if the server is not processing requests fast enough to keep up.")
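+    # A minimal sketch of how such a cap can be enforced (an assumption about
+    # the pattern; see limited_request_func in benchmark() above for this
+    # script's actual wrapper):
+    #   semaphore = asyncio.Semaphore(max_concurrency)
+    #   async def limited_request_func(request_func_input, pbar):
+    #       async with semaphore:
+    #           return await request_func(
+    #               request_func_input=request_func_input, pbar=pbar)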
+ parser.add_argument(
+ "--model",
+ type=str,
+ required=True,
+ help="Name of the model.",
+ )
+ parser.add_argument(
+ "--tokenizer",
+ type=str,
+ help=
+ "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
+ )
+ parser.add_argument(
+ "--num-prompts",
+ type=int,
+ default=1000,
+ help="Number of prompts to process.",
+ )
+ parser.add_argument(
+ "--output-len",
+ type=int,
+ default=128,
+ help="Number of output tokens.",
+ )
+ parser.add_argument(
+ "--request-rate",
+ type=float,
+ default=float("inf"),
+ help="Number of requests per second. If this is inf, "
+ "then all the requests are sent at time 0. "
+        "Otherwise, we use a Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
+ )
+ parser.add_argument(
+ "--burstiness",
+ type=float,
+ default=1.0,
+ help="Burstiness factor of the request generation. "
+        "Only takes effect when request_rate is not inf. "
+        "Default value is 1, which follows a Poisson process. "
+ "Otherwise, the request intervals follow a gamma distribution. "
+ "A lower burstiness value (0 < burstiness < 1) results in more "
+ "bursty requests. A higher burstiness value (burstiness > 1) "
+ "results in a more uniform arrival of requests.",
+ )
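+    # Illustrative sketch (an assumption about the sampler, not necessarily
+    # the exact code used by benchmark()): with request rate r and
+    # burstiness b, inter-arrival times can be drawn as
+    #   interval = np.random.gamma(burstiness, 1.0 / (request_rate * burstiness))
+    # which keeps the mean interval at 1/r while b controls the variance.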
+ parser.add_argument("--seed", type=int, default=0)
+ parser.add_argument(
+ "--trust-remote-code",
+ action="store_true",
+ help="Trust remote code from huggingface",
+ )
+ parser.add_argument(
+ "--disable-tqdm",
+ action="store_true",
+ help="Specify to disable tqdm progress bar.",
+ )
+ parser.add_argument(
+ "--save-results",
+ action="store_true",
+ help="Specify to save benchmark results to a json file",
+ )
+ parser.add_argument(
+ "--profile",
+ action="store_true",
+ help="Use Torch Profiler. The endpoint must be launched with "
+ "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ )
+ parser.add_argument(
+ "--result-dir",
+ type=str,
+ default=None,
+        help="Specify directory to save benchmark json results. "
+ "If not specified, results are saved in the current directory.",
+ )
+ parser.add_argument(
+ "--result-filename",
+ type=str,
+ default=None,
+        help="Specify the filename to save benchmark json results. "
+ "If not specified, results will be saved in "
+ "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+ " format.",
+ )
+ parser.add_argument(
+ "--ignore-eos",
+ action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request. "
+ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+ parser.add_argument(
+ "--percentile-metrics",
+ type=str,
+ default="ttft,tpot,itl",
+        help="Comma-separated list of selected metrics to report "
+        "percentiles for. "
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        "Default value is \"ttft,tpot,itl\".")
+ parser.add_argument(
+ "--metric-percentiles",
+ type=str,
+ default="99",
+        help="Comma-separated list of percentiles for selected metrics. "
+ "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
+ "Default value is \"99\". "
+ "Use \"--percentile-metrics\" to select metrics.",
+ )
+ parser.add_argument("--no-guided-decoding",
+ action='store_true',
+ default=False,
+                        help="Whether to disable guided decoding.")
+ parser.add_argument("--guided-decoding-ratio",
+ type=float,
+ default=1.0,
+ help="Ratio of Guided Decoding requests")
+ parser.add_argument("--guided-decoding-backend",
+ type=str,
+ choices=["outlines", "lm-format-enforcer", "xgrammar"],
+ default="xgrammar",
+ help="Backend to use for guided decoding")
+
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 159cf055737ce..1e5967bd9bf8b 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -294,23 +294,36 @@ def main(args: argparse.Namespace):
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
- # Synthesize a prompt with the given input length.
- # As tokenizer may add additional tokens like BOS, we need to try
- # different lengths to get the desired input length.
- for i in range(-10, 10):
- prompt = "hi " * (args.input_len + i)
- tokenized_prompt = tokenizer(prompt).input_ids
- if len(tokenized_prompt) == args.input_len:
- break
- else:
- raise ValueError(
- f"Failed to synthesize a prompt with {args.input_len} tokens.")
- requests = [
- SampleRequest(prompt=prompt,
- prompt_len=args.input_len,
- expected_output_len=args.output_len)
- for _ in range(args.num_prompts)
- ]
+ vocab_size = tokenizer.vocab_size
+ requests = []
+ for _ in range(args.num_prompts):
+ # Synthesize a prompt with the given input length.
+ candidate_ids = [
+ random.randint(0, vocab_size - 1)
+ for _ in range(args.input_len)
+ ]
+            # As the tokenizer may add additional tokens like BOS, we need to
+            # try different lengths to get the desired input length.
+ for _ in range(5): # Max attempts to correct
+ candidate_prompt = tokenizer.decode(candidate_ids)
+ tokenized_len = len(tokenizer.encode(candidate_prompt))
+
+ if tokenized_len == args.input_len:
+ break
+
+ # Adjust length based on difference
+ diff = args.input_len - tokenized_len
+ if diff > 0:
+ candidate_ids.extend([
+ random.randint(100, vocab_size - 100)
+ for _ in range(diff)
+ ])
+ else:
+ candidate_ids = candidate_ids[:diff]
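+                # e.g. if input_len is 512 but the decoded prompt re-encodes
+                # to 515 tokens, diff is -3 and the last three candidate ids
+                # are dropped before the next attempt.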
+ requests.append(
+ SampleRequest(prompt=candidate_prompt,
+ prompt_len=args.input_len,
+ expected_output_len=args.output_len))
else:
requests = sample_requests(tokenizer, args)
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
new file mode 100644
index 0000000000000..3d1c5e392f9e2
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -0,0 +1,384 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from utils import make_rand_sparse_tensors
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.utils import FlexibleArgumentParser
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+DEFAULT_TP_SIZES = [1]
+
+
+# bench
+def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
+ **kwargs) -> TMeasurement:
+ min_run_time = 1
+
+ globals = {
+ "args": args,
+ "kwargs": kwargs,
+ "fn": fn,
+ }
+ return TBenchmark.Timer(
+ stmt="fn(*args, **kwargs)",
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description=description,
+ ).blocked_autorange(min_run_time=min_run_time)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+ sub_label: str) -> Iterable[TMeasurement]:
+ assert dtype == torch.int8
+ b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
+ scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+
+ out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
+ torch.bfloat16)
+ out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
+
+ if not torch.allclose(out, out_ref):
+ print("Incorrect results")
+ print(out)
+ print(out_ref)
+ else:
+ print("Correct results")
+
+ timers = []
+ # pytorch impl - bfloat16
+ timers.append(
+ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+ torch.mm, a.to(dtype=torch.bfloat16),
+ b.to(dtype=torch.bfloat16)))
+
+ # pytorch impl - float16
+ timers.append(
+ bench_fn(label, sub_label,
+ "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
+ a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
+
+ # cutlass impl
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+ torch.bfloat16))
+
+ # cutlass with bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+ bias))
+
+ # cutlass sparse impl
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.bfloat16))
+
+ # cutlass sparse with bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.bfloat16, bias))
+
+ return timers
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+ sub_label: str) -> Iterable[TMeasurement]:
+ assert dtype == torch.float8_e4m3fn
+ b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
+ k)
+ scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+
+ out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
+ torch.bfloat16)
+ out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
+
+ if not torch.allclose(out, out_ref):
+ print("Incorrect results")
+ print(out)
+ print(out_ref)
+ else:
+ print("Correct results")
+
+ timers = []
+
+ # pytorch impl w. bf16
+ timers.append(
+ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+ torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
+ b.to(dtype=torch.bfloat16, device="cuda")))
+
+ # pytorch impl: bf16 output, without fp8 fast accum
+ timers.append(
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_bf16_scaled_mm",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.bfloat16))
+
+ # pytorch impl: bf16 output, with fp8 fast accum
+ timers.append(
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.bfloat16,
+ use_fast_accum=True))
+
+ # pytorch impl: fp16 output, without fp8 fast accum
+ timers.append(
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_fp16_scaled_mm",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.float16))
+
+ # pytorch impl: fp16 output, with fp8 fast accum
+ timers.append(
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.float16,
+ use_fast_accum=True))
+
+ # cutlass impl: bf16 output
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+ torch.bfloat16))
+
+ # cutlass impl: bf16 output
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.bfloat16))
+
+ # cutlass impl: fp16 output
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.float16))
+
+ # cutlass impl: bf16 output, with bias
+ timers.append(
+ bench_fn(label, sub_label,
+ "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.bfloat16, bias))
+
+ # cutlass impl: fp16 output, with bias
+ timers.append(
+ bench_fn(label, sub_label,
+ "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
+ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+ scale_b, torch.float16, bias.to(dtype=torch.float16)))
+
+ return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+ sub_label: str) -> Iterable[TMeasurement]:
+ if dtype == torch.int8:
+ return bench_int8(dtype, m, k, n, label, sub_label)
+ if dtype == torch.float8_e4m3fn:
+ return bench_fp8(dtype, m, k, n, label, sub_label)
+ raise ValueError("unsupported type")
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+def run(dtype: torch.dtype,
+ MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+ results = []
+ for m, k, n in MKNs:
+ timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
+ f"MKN=({m}x{k}x{n})")
+ print_timers(timers)
+ results.extend(timers)
+
+ return results
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+ MKNs: Iterable[Tuple[int, int, int]],
+ base_description: str,
+ timestamp=None):
+ print(f"== All Results {base_description} ====")
+ print_timers(data)
+
+ # pickle all the results
+ timestamp = int(time.time()) if timestamp is None else timestamp
+ with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
+ pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+ dim_sizes = list(
+ range(args.dim_start, args.dim_end + 1, args.dim_increment))
+ MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
+ data = run(args.dtype, MKNs)
+
+ make_output(data, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+ dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
+ n = len(dim_sizes)
+ Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
+ Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
+ Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
+ MKNs = list(zip(Ms, Ks, Ns))
+ data = run(args.dtype, MKNs)
+
+ make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+ print("Benchmarking models:")
+ for i, model in enumerate(args.models):
+ print(f"[{i}] {model}")
+
+ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+ KNs = []
+ for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+ KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+ KNs.append(KN)
+ return KNs
+
+ model_bench_data = []
+ models_tps = list(itertools.product(args.models, args.tp_sizes))
+ for model, tp_size in models_tps:
+ Ms = args.batch_sizes
+ KNs = model_shapes(model, tp_size)
+ MKNs = []
+ for m in Ms:
+ for k, n in KNs:
+ MKNs.append((m, k, n))
+
+ data = run(args.dtype, MKNs)
+ model_bench_data.append(data)
+
+ # Print all results
+ for data, model_tp in zip(model_bench_data, models_tps):
+ model, tp_size = model_tp
+ print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+ print_timers(data)
+
+ timestamp = int(time.time())
+
+ all_data = []
+ for d in model_bench_data:
+ all_data.extend(d)
+ # pickle all data
+ with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+ pkl.dump(all_data, f)
+
+
+if __name__ == '__main__':
+
+ def to_torch_dtype(dt):
+ if dt == "int8":
+ return torch.int8
+ if dt == "fp8":
+ return torch.float8_e4m3fn
+ raise ValueError("unsupported dtype")
+
+ parser = FlexibleArgumentParser(
+ description="""
+Benchmark Cutlass GEMM.
+
+ To run square GEMMs:
+ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+
+ To run constant N and K and sweep M:
+ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+
+ To run dimensions from a model:
+ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+
+ Output:
+ - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
+ """, # noqa: E501
+ formatter_class=argparse.RawTextHelpFormatter)
+
+ parser.add_argument("--dtype",
+ type=to_torch_dtype,
+ required=True,
+ help="Available options are ['int8', 'fp8']")
+ subparsers = parser.add_subparsers(dest="cmd")
+
+ square_parser = subparsers.add_parser("square_bench")
+ square_parser.add_argument("--dim-start", type=int, required=True)
+ square_parser.add_argument("--dim-end", type=int, required=True)
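+    # e.g. for k = n = 4096 with 4-bit weights each copy is
+    # 4096 * 4096 * 4 / 8 bytes = 8 MiB, so roughly 13 copies are generated.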
+ square_parser.add_argument("--dim-increment", type=int, required=True)
+ square_parser.set_defaults(func=run_square_bench)
+
+ range_parser = subparsers.add_parser("range_bench")
+ range_parser.add_argument("--dim-start", type=int, required=True)
+ range_parser.add_argument("--dim-end", type=int, required=True)
+ range_parser.add_argument("--dim-increment", type=int, required=True)
+ range_parser.add_argument("--m-constant", type=int, default=None)
+ range_parser.add_argument("--n-constant", type=int, default=None)
+ range_parser.add_argument("--k-constant", type=int, default=None)
+ range_parser.set_defaults(func=run_range_bench)
+
+ model_parser = subparsers.add_parser("model_bench")
+ model_parser.add_argument("--models",
+ nargs="+",
+ type=str,
+ default=DEFAULT_MODELS,
+ choices=WEIGHT_SHAPES.keys())
+ model_parser.add_argument("--tp-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_TP_SIZES)
+ model_parser.add_argument("--batch-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_BATCH_SIZES)
+ model_parser.set_defaults(func=run_model_bench)
+
+ args = parser.parse_args()
+ args.func(args)
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
new file mode 100644
index 0000000000000..ef06fcd6604dd
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -0,0 +1,96 @@
+# Cutlass bench utils
+from typing import Iterable, Tuple
+
+import torch
+
+import vllm._custom_ops as ops
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+ finfo = torch.finfo(torch.float8_e4m3fn)
+ return torch.round(tensor.clamp(
+ min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+ return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
+ return tensor.to(dtype=torch.bfloat16)
+
+
+def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
+ return tensor.to(dtype=torch.float16)
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+ k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+ a = torch.randn((m, k), device='cuda') * 5
+ b = torch.randn((n, k), device='cuda').t() * 5
+
+ if dtype == torch.int8:
+ return to_int8(a), to_int8(b)
+ if dtype == torch.float8_e4m3fn:
+ return to_fp8(a), to_fp8(b)
+
+ raise ValueError("unsupported dtype")
+
+
+def prune_to_2_4(tensor):
+ # Reshape tensor to [N, 4] where N is number of groups of 4
+ original_shape = tensor.shape
+ reshaped = tensor.reshape(-1, 4)
+
+ # Get indices of top 2 absolute values in each group of 4
+ _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)
+
+ # Create binary mask
+ mask = torch.zeros_like(reshaped)
+ mask.scatter_(dim=1,
+ index=indices,
+ src=torch.ones_like(indices, dtype=mask.dtype))
+
+ # Apply mask and reshape back
+ pruned = reshaped * mask
+
+ # Turn all -0.0 to 0.0
+ pruned[pruned == -0.0] = 0.0
+
+ return pruned.reshape(original_shape)
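+# Illustrative example: a group [0.5, -2.0, 0.3, 4.0] keeps its two largest
+# magnitudes and becomes [0.0, -2.0, 0.0, 4.0] (2:4 structured sparsity).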
+
+
+def make_rand_sparse_tensors(
+    dtype: torch.dtype, m: int, n: int, k: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ a = torch.randn((m, k), device='cuda') * 5
+ b = torch.randn((n, k), device='cuda').t() * 5
+
+ b = prune_to_2_4(b.t()).t()
+
+ if dtype == torch.int8:
+ a, b = to_int8(a), to_int8(b)
+ elif dtype == torch.float8_e4m3fn:
+ a, b = to_fp8(a), to_fp8(b)
+ elif dtype == torch.float16:
+ a, b = to_fp16(a), to_fp16(b)
+ elif dtype == torch.bfloat16:
+ a, b = to_bf16(a), to_bf16(b)
+ else:
+ raise ValueError("unsupported dtype")
+
+ b_compressed, e = ops.cutlass_sparse_compress(b.t())
+
+ # Compressed B, Metadata, Original A, B
+ return b_compressed, e, a, b
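+# Example usage, mirroring how sparse_benchmarks.py consumes the return values:
+#   b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
+#   out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
+#                                      torch.bfloat16)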
+
+
+def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
+                               m: int, n: int, k: int) -> \
+        Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor],
+              Iterable[torch.Tensor], Iterable[torch.Tensor]]:
+    ABs = []
+    for _ in range(num_tensors):
+        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
+        if b_comp is not None:
+            # reuse the tensors generated above instead of sampling them twice
+            ABs.append((b_comp, e, a, b))
+ BComps, Es, As, Bs = zip(*ABs)
+ return list(BComps), list(Es), list(As), list(Bs)
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 63cf5d50cac75..d0353bc8cb42a 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -8,6 +8,7 @@
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
+from utils import make_rand_tensors
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
@@ -17,31 +18,6 @@
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
-# helpers
-
-
-def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
- finfo = torch.finfo(torch.float8_e4m3fn)
- return torch.round(tensor.clamp(
- min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor) -> torch.Tensor:
- return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
- k: int) -> Tuple[torch.Tensor, torch.Tensor]:
- a = torch.randn((m, k), device='cuda') * 5
- b = torch.randn((n, k), device='cuda').t() * 5
-
- if dtype == torch.int8:
- return to_int8(a), to_int8(b)
- if dtype == torch.float8_e4m3fn:
- return to_fp8(a), to_fp8(b)
-
- raise ValueError("unsupported dtype")
-
# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
@@ -386,4 +362,4 @@ def to_torch_dtype(dt):
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
- args.func(args)
+ args.func(args)
\ No newline at end of file
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
index 25ec9d6028627..d58fb0bf86374 100644
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -40,4 +40,4 @@
([8192, 57344], 1),
([28672, 8192], 0),
],
-}
+}
\ No newline at end of file
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
new file mode 100644
index 0000000000000..2924ea4a49f54
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+# benchmark the overhead of disaggregated prefill.
+# methodology:
+# - send all requests to the prefill vLLM instance; it will buffer the KV cache.
+# - then send all requests to the decode instance.
+# - the TTFT of the decode instance is the overhead of disaggregated prefill.
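+#
+# Concretely, benchmark() below runs ../benchmark_serving.py twice: first
+# against the prefill instance on port 8100 (to populate the KV buffer), then
+# against the decode instance on port 8200, whose reported TTFT is the
+# disaggregated-prefill overhead.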
+
+set -ex
+
+kill_gpu_processes() {
+ # kill all processes on GPU.
+ pkill -f pt_main_thread
+ sleep 10
+
+ # remove vllm config file
+ rm -rf ~/.config/vllm
+
+ # Print the GPU memory usage
+ # so that we know if all GPU processes are killed.
+ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+ # The memory usage should be 0 MB.
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+wait_for_server() {
+ # wait for vllm server to start
+  # return 1 if the server does not come up before the timeout
+ local port=$1
+ timeout 1200 bash -c "
+ until curl -s localhost:${port}/v1/completions > /dev/null; do
+ sleep 1
+ done" && return 0 || return 1
+}
+
+
+benchmark() {
+
+ export VLLM_LOGGING_LEVEL=DEBUG
+ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+ # compare chunked prefill with disaggregated prefill
+
+ results_folder="./results"
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ dataset_name="sonnet"
+ dataset_path="../sonnet_4x.txt"
+ num_prompts=10
+ qps=$1
+ prefix_len=50
+ input_len=2048
+ output_len=$2
+
+
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --port 8100 \
+ --max-model-len 10000 \
+ --gpu-memory-utilization 0.6 \
+ --kv-transfer-config \
+ '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --port 8200 \
+ --max-model-len 10000 \
+ --gpu-memory-utilization 0.6 \
+ --kv-transfer-config \
+ '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+ wait_for_server 8100
+ wait_for_server 8200
+
+ # let the prefill instance finish prefill
+ python3 ../benchmark_serving.py \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8100 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename disagg_prefill_2xtp4.json \
+ --request-rate "inf"
+
+
+ # send the request to decode.
+ # The TTFT of this command will be the overhead of disagg prefill impl.
+ python3 ../benchmark_serving.py \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8200 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename disagg_prefill_2xtp4.json \
+ --request-rate "$qps"
+ kill_gpu_processes
+
+}
+
+
+main() {
+
+ (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+ (which jq) || (apt-get -y install jq)
+ (which socat) || (apt-get -y install socat)
+
+ pip install quart httpx
+
+ cd "$(dirname "$0")"
+
+ cd ..
+  # create sonnet_4x.txt
+ echo "" > sonnet_4x.txt
+ for _ in {1..4}
+ do
+ cat sonnet.txt >> sonnet_4x.txt
+ done
+ cd disagg_benchmarks
+
+ rm -rf results
+ mkdir results
+
+ default_qps=1
+ default_output_len=1
+ benchmark $default_qps $default_output_len
+
+}
+
+
+main "$@"
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
new file mode 100644
index 0000000000000..d8d9e976dce76
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+
+# Requirement: 8x H100 GPUs.
+
+
+# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
+# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
+# Resource: 8x H100
+# Approaches:
+# 1. Chunked prefill: 1 vllm instance with tp=8
+# 2. Chunked prefill: 2 vllm instances with tp=4, equivalent to 1 tp=4 instance with QPS 4
+# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
+# Prefilling instance: max_output_token=1
+# Decoding instance: force the input tokens to be the same across requests to bypass prefilling
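+#
+# In the script below, launch_chunked_prefill() starts two single-GPU vLLM
+# instances behind round_robin_proxy.py, and launch_disagg_prefill() starts a
+# prefill/decode pair behind disagg_prefill_proxy_server.py; both proxies
+# listen on port 8000, which is the port benchmark_serving.py targets.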
+
+set -ex
+
+kill_gpu_processes() {
+ # kill all processes on GPU.
+ pgrep pt_main_thread | xargs -r kill -9
+ pgrep python3 | xargs -r kill -9
+ for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
+ sleep 1
+}
+
+wait_for_server() {
+ # wait for vllm server to start
+  # return 1 if the server does not come up before the timeout
+ local port=$1
+ timeout 1200 bash -c "
+ until curl -s localhost:${port}/v1/completions > /dev/null; do
+ sleep 1
+ done" && return 0 || return 1
+}
+
+
+launch_chunked_prefill() {
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ # disagg prefill
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
+ --port 8100 \
+ --max-model-len 10000 \
+ --enable-chunked-prefill \
+ --gpu-memory-utilization 0.6 &
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
+ --port 8200 \
+ --max-model-len 10000 \
+ --enable-chunked-prefill \
+ --gpu-memory-utilization 0.6 &
+ wait_for_server 8100
+ wait_for_server 8200
+ python3 round_robin_proxy.py &
+ sleep 1
+}
+
+
+launch_disagg_prefill() {
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ # disagg prefill
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
+ --port 8100 \
+ --max-model-len 10000 \
+ --gpu-memory-utilization 0.6 \
+ --kv-transfer-config \
+ '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
+ --port 8200 \
+ --max-model-len 10000 \
+ --gpu-memory-utilization 0.6 \
+ --kv-transfer-config \
+ '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+ wait_for_server 8100
+ wait_for_server 8200
+ python3 disagg_prefill_proxy_server.py &
+ sleep 1
+}
+
+
+benchmark() {
+ results_folder="./results"
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ dataset_name="sonnet"
+ dataset_path="../sonnet_4x.txt"
+ num_prompts=100
+ qps=$1
+ prefix_len=50
+ input_len=1024
+ output_len=$2
+ tag=$3
+
+ python3 ../benchmark_serving.py \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8000 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename "$tag"-qps-"$qps".json \
+ --request-rate "$qps"
+
+ sleep 2
+
+}
+
+
+main() {
+
+ (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+ (which jq) || (apt-get -y install jq)
+ (which socat) || (apt-get -y install socat)
+
+ pip install quart httpx matplotlib aiohttp
+
+ cd "$(dirname "$0")"
+
+ cd ..
+  # create sonnet_4x.txt so that we can sample 2048 tokens for input
+ echo "" > sonnet_4x.txt
+ for _ in {1..4}
+ do
+ cat sonnet.txt >> sonnet_4x.txt
+ done
+ cd disagg_benchmarks
+
+ rm -rf results
+ mkdir results
+
+ default_output_len=6
+
+ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+ launch_chunked_prefill
+ for qps in 2 4 6 8; do
+ benchmark $qps $default_output_len chunked_prefill
+ done
+ kill_gpu_processes
+
+ launch_disagg_prefill
+ for qps in 2 4 6 8; do
+ benchmark $qps $default_output_len disagg_prefill
+ done
+ kill_gpu_processes
+
+ python3 visualize_benchmark_results.py
+
+}
+
+
+main "$@"
diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
new file mode 100644
index 0000000000000..4058b1c0a3b79
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@@ -0,0 +1,61 @@
+import os
+
+import aiohttp
+from quart import Quart, make_response, request
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+app = Quart(__name__)
+
+
+async def forward_request(url, data):
+ async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+ headers = {
+ "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+ }
+ async with session.post(url=url, json=data,
+ headers=headers) as response:
+ if response.status == 200:
+ # if response.headers.get('Transfer-Encoding') == 'chunked':
+ if True:
+ async for chunk_bytes in response.content.iter_chunked(
+ 1024):
+ yield chunk_bytes
+ else:
+ content = await response.read()
+ yield content
+
+
+@app.route('/v1/completions', methods=['POST'])
+async def handle_request():
+ try:
+ original_request_data = await request.get_json()
+
+ prefill_request = original_request_data.copy()
+ # change max_tokens = 1 to let it only do prefill
+ prefill_request['max_tokens'] = 1
+
+ # finish prefill
+ async for _ in forward_request('http://localhost:8100/v1/completions',
+ prefill_request):
+ continue
+
+ # return decode
+ generator = forward_request('http://localhost:8200/v1/completions',
+ original_request_data)
+ response = await make_response(generator)
+ response.timeout = None
+
+ return response
+
+ except Exception as e:
+ import sys
+ import traceback
+ exc_info = sys.exc_info()
+ print("Error occurred in disagg prefill proxy server")
+ print(e)
+ print("".join(traceback.format_exception(*exc_info)))
+
+
+if __name__ == '__main__':
+ app.run(port=8000)
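+    # Illustrative client call once both vLLM instances and this proxy are up
+    # (any OpenAI-compatible /v1/completions payload works), e.g.:
+    #   import requests
+    #   requests.post("http://localhost:8000/v1/completions",
+    #                 json={"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    #                       "prompt": "San Francisco is a",
+    #                       "max_tokens": 16})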
diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py
new file mode 100644
index 0000000000000..6eb5f63980070
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py
@@ -0,0 +1,60 @@
+import asyncio
+import itertools
+
+import aiohttp
+from aiohttp import web
+
+
+class RoundRobinProxy:
+
+ def __init__(self, target_ports):
+ self.target_ports = target_ports
+ self.port_cycle = itertools.cycle(self.target_ports)
+
+ async def handle_request(self, request):
+ target_port = next(self.port_cycle)
+ target_url = f"http://localhost:{target_port}{request.path_qs}"
+
+ async with aiohttp.ClientSession() as session:
+ try:
+ # Forward the request
+ async with session.request(
+ method=request.method,
+ url=target_url,
+ headers=request.headers,
+ data=request.content,
+ ) as response:
+ # Start sending the response
+ resp = web.StreamResponse(status=response.status,
+ headers=response.headers)
+ await resp.prepare(request)
+
+ # Stream the response content
+ async for chunk in response.content.iter_any():
+ await resp.write(chunk)
+
+ await resp.write_eof()
+ return resp
+
+ except Exception as e:
+ return web.Response(text=f"Error: {str(e)}", status=500)
+
+
+async def main():
+ proxy = RoundRobinProxy([8100, 8200])
+ app = web.Application()
+ app.router.add_route('*', '/{path:.*}', proxy.handle_request)
+
+ runner = web.AppRunner(app)
+ await runner.setup()
+ site = web.TCPSite(runner, 'localhost', 8000)
+ await site.start()
+
+ print("Proxy server started on http://localhost:8000")
+
+ # Keep the server running
+ await asyncio.Event().wait()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
new file mode 100644
index 0000000000000..e59d8bb0e6c8c
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
@@ -0,0 +1,46 @@
+import json
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+if __name__ == "__main__":
+
+ data = []
+ for name in ['disagg_prefill', 'chunked_prefill']:
+ for qps in [2, 4, 6, 8]:
+ with open(f"results/{name}-qps-{qps}.json") as f:
+ x = json.load(f)
+ x['name'] = name
+ x['qps'] = qps
+ data.append(x)
+
+ df = pd.DataFrame.from_dict(data)
+ dis_df = df[df['name'] == 'disagg_prefill']
+ chu_df = df[df['name'] == 'chunked_prefill']
+
+ plt.style.use('bmh')
+ plt.rcParams['font.size'] = 20
+
+ for key in [
+ 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms',
+ 'median_itl_ms', 'p99_itl_ms'
+ ]:
+
+ fig, ax = plt.subplots(figsize=(11, 7))
+ plt.plot(dis_df['qps'],
+ dis_df[key],
+ label='disagg_prefill',
+ marker='o',
+ linewidth=4)
+ plt.plot(chu_df['qps'],
+ chu_df[key],
+ label='chunked_prefill',
+ marker='o',
+ linewidth=4)
+ ax.legend()
+
+ ax.set_xlabel('QPS')
+ ax.set_ylabel(key)
+ ax.set_ylim(bottom=0)
+ fig.savefig(f'results/{key}.png')
+ plt.close(fig)
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
new file mode 100644
index 0000000000000..ef91f9f8eb529
--- /dev/null
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -0,0 +1,173 @@
+import pickle as pkl
+import time
+from dataclasses import dataclass
+from itertools import product
+from typing import Callable, Iterable, List, Optional
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from tqdm import tqdm
+
+import vllm._custom_ops as ops
+from vllm.model_executor.layers.layernorm import RMSNorm
+
+
+@dataclass
+class bench_params_t:
+ num_tokens: int
+ hidden_size: int
+ add_residual: bool
+ dtype: torch.dtype
+
+ def description(self):
+ return (f'N {self.num_tokens} '
+ f'x D {self.hidden_size} '
+ f'x R {self.add_residual} '
+ f'x DT {self.dtype}')
+
+
+def get_bench_params() -> List[bench_params_t]:
+ ## Test Fixtures
+ NUM_TOKENS = [2**x for x in range(11)]
+ HIDDEN_SIZES = list(range(1024, 8129, 1024))
+ ADD_RESIDUAL = [True, False]
+ DTYPES = [torch.bfloat16, torch.float]
+
+ combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+    bench_params = [
+        bench_params_t(*combination) for combination in combinations
+    ]
+ return bench_params
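+# Note: NUM_TOKENS has 11 values and range(1024, 8129, 1024) yields 7 hidden
+# sizes (1024 .. 7168), so 11 * 7 * 2 * 2 = 308 parameter combinations are
+# benchmarked.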
+
+
+# Reference impls
+def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
+ residual: Optional[torch.Tensor],
+ quant_dtype: torch.dtype):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _, _ = ops.scaled_int8_quant(torch_out)
+
+
+def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
+ residual: Optional[torch.Tensor],
+ quant_dtype: torch.dtype):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _ = ops.scaled_fp8_quant(torch_out)
+
+
+def fused_impl(
+ rms_norm_layer: RMSNorm, # this stores the weights
+ x: torch.Tensor,
+ residual: Optional[torch.Tensor],
+ quant_dtype: torch.dtype):
+ out, _ = ops.rms_norm_dynamic_per_token_quant(x,
+ rms_norm_layer.weight,
+ 1e-6,
+ quant_dtype,
+ residual=residual)
+
+
+# Bench functions
+def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor,
+ quant_dtype: torch.dtype, label: str, sub_label: str,
+ fn: Callable, description: str) -> TMeasurement:
+
+ min_run_time = 1
+
+ globals = {
+ "rms_norm_layer": rms_norm_layer,
+ "x": x,
+ "residual": residual,
+ "quant_dtype": quant_dtype,
+ "fn": fn,
+ }
+ return TBenchmark.Timer(
+ stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description=description,
+ ).blocked_autorange(min_run_time=min_run_time)
+
+def bench(params: bench_params_t, label: str, sub_label: str) \
+ -> Iterable[TMeasurement]:
+
+ # Make inputs
+ layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype)
+ # Make weights
+ layer.weight.data.normal_(mean=1.0, std=0.1)
+ # Make inputs
+ scale = 1 / params.hidden_size
+ x = torch.randn(params.num_tokens,
+ params.hidden_size,
+ dtype=params.dtype,
+ device='cuda') * scale
+ residual = (torch.randn_like(x) * scale).to(device='cuda') \
+ if params.add_residual else None
+
+ timers = []
+
+ # unfused int8 impl.
+ timers.append(
+ bench_fn(layer, x, residual, torch.int8, label, sub_label,
+ unfused_int8_impl, "unfused_int8_impl"))
+
+ # unfused fp8 impl.
+ timers.append(
+ bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
+ unfused_fp8_impl, "unfused_fp8_impl"))
+
+ # fused int8 impl.
+ timers.append(
+ bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl,
+ "fused_int8_impl"))
+
+ # fused fp8 impl.
+ timers.append(
+ bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
+ fused_impl, "fused_fp8_impl"))
+
+ print_timers(timers)
+
+ return timers
+
+
+# launch bench
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+def main():
+ torch.set_default_device('cuda')
+ bench_params = get_bench_params()
+
+ timers = []
+ for bp in tqdm(bench_params):
+ timers.extend(
+ bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
+ print_timers(timers)
+
+ # pickle all the results
+ timestamp = int(time.time())
+ with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f:
+ pkl.dump(timers, f)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 665b50bf18cf0..46bab74ae8adf 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -2,8 +2,10 @@
import copy
import itertools
import math
+import os
import pickle as pkl
import time
+from dataclasses import dataclass
from itertools import product
from typing import Callable, Iterable, List, Optional, Tuple
@@ -15,11 +17,12 @@
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
- GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
+ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales,
+ marlin_zero_points)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
- gptq_pack, pack_rows, quantize_weights)
+ pack_rows, quantize_weights)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
@@ -27,149 +30,350 @@
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
DEFAULT_TP_SIZES = [1]
+NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False)
+
+if NVTX_PROFILE:
+ import nvtx
+
+
+def terse_type_name(dt):
+ return {
+ torch.bfloat16: "bf16",
+ torch.float16: "fp16",
+ torch.int8: "int8",
+ torch.float8_e4m3fn: "fp8",
+ torch.float: "float",
+ torch.int: "int",
+ }[dt]
+
+
+@dataclass
+class BenchmarkTensors:
+ w_ref: torch.Tensor
+ a: torch.Tensor
+
+ w_q: torch.Tensor
+ group_size: Optional[int]
+ wtype: ScalarType
+ w_g_s: torch.Tensor
+ w_g_zp: Optional[torch.Tensor]
+ w_ch_s: Optional[torch.Tensor]
+ w_tok_s: Optional[torch.Tensor]
+
+
+@dataclass
+class TypeConfig:
+ act_type: torch.dtype
+ weight_type: ScalarType
+ output_type: Optional[torch.dtype]
+ group_scale_type: Optional[torch.dtype]
+ group_zero_type: Optional[torch.dtype]
+ channel_scale_type: Optional[torch.dtype]
+ token_scale_type: Optional[torch.dtype]
+
+
+def rand_data(shape, dtype=torch.float16, scale=1):
+ if dtype.is_floating_point:
+ return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype)
+ else:
+ return torch.randint(-15, 15, shape, dtype=dtype, device="cuda")
+
+
+def quantize_and_pack(atype: torch.dtype,
+ w: torch.Tensor,
+ wtype: ScalarType,
+ stype: Optional[torch.dtype],
+ group_size: Optional[int],
+ zero_points: bool = False):
+ assert wtype.is_integer(), "TODO: support floating point weights"
+
+ w_ref, w_q, w_s, w_zp = quantize_weights(
+ w,
+ wtype,
+ group_size=group_size,
+ zero_points=zero_points,
+ # to match how the kernel applies zps
+ ref_zero_points_after_scales=True)
-def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
- w_q = w_q.t().contiguous().t() # make col major
- return ops.machete_prepack_B(w_q, wtype)
+ return w_ref, w_q, w_s, w_zp
-def make_bench_tensors(
- atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
- k: int
-) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
- torch.tensor]]]:
- assert wtype.is_integer(), "TODO: support floating point weights"
+def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
+ group_size: Optional[int]) -> List[BenchmarkTensors]:
+ m, n, k = shape
# we want to make sure that weights don't fit into L2 cache between runs so
# we construct enough weights to exceed L2 cache, which is 50mb on a H100
# so we target total weight size > 2*50mb
- num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
-
- a = torch.randn((m, k), device="cuda", dtype=atype) * 5
- weights = [
- torch.randn((k, n), device="cuda", dtype=atype)
- for _ in range(num_weights)
- ]
- quanitized_weights = [
- quantize_weights(w, wtype, group_size) for w in weights
- ]
-
- return a, quanitized_weights
+ num_weights = math.ceil(2 * 50 * 1024**2 * 8 /
+ (k * n * types.weight_type.size_bits))
+
+ a = rand_data((m, k), types.act_type, scale=5)
+
+ benchmark_tensors: List[BenchmarkTensors] = []
+ for _ in range(num_weights):
+ w = rand_data((k, n), types.act_type, scale=5)
+
+ if types.group_scale_type is not None:
+ w = w.to(types.group_scale_type)
+ if w.dtype.itemsize == 1:
+ w = w.to(torch.float16)
+
+ w_ref, w_q_packed, w_s, w_zp = quantize_and_pack(
+ a.dtype, w, types.weight_type, types.group_scale_type, group_size,
+ types.group_zero_type is not None)
+
+ if not a.dtype.is_floating_point:
+ aiinfo = torch.iinfo(a.dtype)
+ w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max)
+
+ w_ref = w_ref.to(torch.float32)
+
+ w_ch_s = None if types.channel_scale_type is None else\
+ rand_data((n,), types.channel_scale_type)
+ w_tok_s = None if types.token_scale_type is None else\
+ rand_data((m,), types.token_scale_type)
+
+ benchmark_tensors.append(
+ BenchmarkTensors(w_ref=w_ref,
+ a=a,
+ w_q=w_q_packed,
+ wtype=types.weight_type,
+ w_g_s=w_s,
+ w_g_zp=w_zp,
+ group_size=group_size,
+ w_ch_s=w_ch_s,
+ w_tok_s=w_tok_s))
+
+ return benchmark_tensors
+
+
+def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ a = bt.a
+ w = bt.w_ref.to(bt.a.dtype) # use float reference tensor
+ if a.dtype not in [torch.float16, torch.bfloat16]:
+ a = a.to(torch.float16)
+ w = w.to(torch.float16)
+ return lambda: torch.matmul(a, w)
+
+
+def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ if bt.w_ch_s is not None and bt.w_tok_s is not None:
+ scale_a = bt.w_tok_s.to(torch.float32)
+ scale_b = bt.w_ch_s.to(torch.float32)
+ else:
+ scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+ scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+ w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t()
+ return lambda: ops.cutlass_scaled_mm(
+ bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16)
+
+
+def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ device = bt.a.device
+
+ workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_MAX_PARALLEL)
+
+ if bt.w_g_zp is None:
+ w_zp = torch.empty(0, dtype=torch.int, device=device)
+ else:
+ w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.wtype.size_bits)
+
+ if bt.group_size is None:
+ w_s = torch.tensor([], device="cuda", dtype=torch.half)
+ else:
+ w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.group_size)
+
+ sort_indices = torch.empty(0, dtype=torch.int, device=device)
+ g_idx = torch.empty(0, dtype=torch.int, device=device)
+ w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.wtype.size_bits)
+
+ if bt.a.dtype.is_floating_point:
+ assert bt.w_ch_s is None
+ assert bt.w_tok_s is None
+ assert bt.group_size is not None
+
+ fn = lambda: ops.gptq_marlin_gemm(a=bt.a,
+ b_q_weight=w_q,
+ b_scales=w_s,
+ b_zeros=w_zp,
+ g_idx=g_idx,
+ perm=sort_indices,
+ workspace=workspace.scratch,
+ b_q_type=bt.wtype,
+ size_m=bt.a.shape[0],
+ size_n=bt.w_ref.shape[1],
+ size_k=bt.w_ref.shape[0],
+ is_k_full=True,
+ is_zp_float=False)
+ else:
+ assert bt.a.dtype == torch.int8
+ assert bt.wtype == scalar_types.uint4b8
+
+ if bt.w_ch_s is not None:
+ s_ch = bt.w_ch_s.to(torch.float32)
+ else:
+ s_ch = torch.ones(bt.w_ref.shape[1],
+ dtype=torch.float32,
+ device=device)
+
+ if bt.w_tok_s is not None:
+ s_tok = bt.w_tok_s.to(torch.float32)
+ else:
+ s_tok = torch.ones(bt.a.shape[0],
+ dtype=torch.float32,
+ device=device)
+
+ fn = lambda: ops.marlin_qqq_gemm(a=bt.a,
+ b_q_weight=w_q,
+ s_group=w_s,
+ s_tok=s_tok,
+ s_ch=s_ch,
+ workspace=workspace.scratch,
+ size_m=bt.a.shape[0],
+ size_n=bt.w_ref.shape[1],
+ size_k=bt.w_ref.shape[0])
+
+ return fn
+
+
+def machete_create_bench_fn(bt: BenchmarkTensors,
+ out_type=torch.dtype,
+ schedule=None) -> Callable:
+ w_q = bt.w_q.t().contiguous().t() # make col major
+ w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype,
+ None if bt.w_g_s is None else bt.w_g_s.dtype)
+
+ w_g_zp = bt.w_g_zp
+ if w_g_zp is not None:
+ w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype))
+
+ return lambda: ops.machete_mm(
+ a=bt.a,
+        b_q=w_q,  # the prepacked weights computed above
+ b_type=bt.wtype,
+ b_group_scales=bt.w_g_s,
+ b_group_zeros=w_g_zp,
+ b_group_size=bt.group_size,
+ b_channel_scales=bt.w_ch_s,
+ a_token_scales=bt.w_tok_s,
+ out_type=out_type,
+ schedule=schedule,
+ )
# impl
-
# bench
-def bench_fn(label: str, sub_label: str, description: str,
- fn: Callable) -> TMeasurement:
- min_run_time = 1
- return TBenchmark.Timer(
- stmt="fn()",
+
+def bench_fns(label: str, sub_label: str, description: str,
+ fns: List[Callable]):
+
+ min_run_time = 1 if not NVTX_PROFILE else 0.1
+ res = TBenchmark.Timer(
+ stmt="""
+ for fn in fns:
+ fn()
+ """,
globals={
- "fn": fn
+ "fns": fns
},
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
+ if NVTX_PROFILE:
+ with nvtx.annotate("mm-bench"), nvtx.annotate(
+ f"{label}|{sub_label}|{description}"):
+ fns[0]()
-def loop_over_weights(
- a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
- torch.tensor, torch.tensor]],
- fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
- None]):
- for w_ref, w_q, w_s, _ in weights:
- fn(a, w_ref, w_q, w_s)
+ return res
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
-def bench(atype: torch.dtype,
- wtype: ScalarType,
+def bench(types: TypeConfig,
group_size: int,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
- benchmark_marlinv1: bool = True,
- sweep_schedules: bool = True) -> Iterable[TMeasurement]:
- global _SWEEP_SCHEDULES_RESULTS
-
- a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
- sub_label += f", L={len(weights)}"
-
- weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
- for w_ref, w_q, w_s, w_zp in weights]
+ sweep_schedules: bool = True) -> List[TMeasurement]:
+ benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
+ sub_label += f", L={len(benchmark_tensors)}"
+
+    name_type_string = (f"W{types.weight_type}"
+                        f"-A{terse_type_name(types.act_type)}")
+ if types.group_scale_type is not None:
+ name_type_string += f"-GS{terse_type_name(types.group_scale_type)}"
+ if types.group_zero_type is not None:
+ name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}"
+ if group_size is not None:
+ name_type_string += f"-G{group_size}"
+ if types.channel_scale_type is not None:
+ name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}"
+ if types.token_scale_type is not None:
+ name_type_string += f"-TS{terse_type_name(types.token_scale_type)}"
timers = []
# pytorch impl
timers.append(
- bench_fn(
- label, sub_label, "torch.matmul", lambda: loop_over_weights(
- a,
- weights,
- lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
- )))
+ bench_fns(
+ label, sub_label, "torch.matmul (fp16)",
+ [torch_matmul_f16_create_bench_fn(bt)
+ for bt in benchmark_tensors]))
- if benchmark_marlinv1:
- w_ref = weights[0][0]
-
- w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
- sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
- g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
-
- def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
- w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
- return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
- wtype.size_bits)
-
- def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
- return marlin_permute_scales(w_s, *w_ref.shape, group_size)
-
- weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
- marlinv1_permute_scales(w_s), w_zp)
- for w_ref, w_q, w_s, w_zp in weights]
-
- workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
- GPTQ_MARLIN_MAX_PARALLEL)
-
- # marlinv1
+ if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn:
+ timers.append(
+ bench_fns(
+ label, sub_label,
+ f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [
+ cutlass_scaled_mm_create_bench_fn(bt)
+ for bt in benchmark_tensors
+ ]))
+
+ if types.act_type != torch.float8_e4m3fn:
timers.append(
- bench_fn(
- label, sub_label, "marlin_orig", lambda: loop_over_weights(
- a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
- gptq_marlin_gemm(a,
- w_q,
- w_s,
- w_zp_empty,
- g_idx,
- sort_indices,
- workspace.scratch,
- wtype,
- size_m=a.shape[0],
- size_n=w_ref.shape[1],
- size_k=w_ref.shape[0],
- is_k_full=True))))
+ bench_fns(label, sub_label, f"marlin ({name_type_string})",
+ [marlin_create_bench_fn(bt)
+ for bt in benchmark_tensors]))
# machete
timers.append(
- bench_fn(
- label, sub_label, "machete_heuristic", lambda: loop_over_weights(
- a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
- a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
+ bench_fns(label, sub_label, f"machete ({name_type_string})", [
+ machete_create_bench_fn(bt, out_type=types.output_type)
+ for bt in benchmark_tensors
+ ]))
if sweep_schedules:
+ global _SWEEP_SCHEDULES_RESULTS
+
print("Finding best schedule for machete")
best = None
best_schedule = None
- schedules = ops.machete_supported_schedules(wtype)
+ schedules = ops.machete_supported_schedules(
+ a_type=types.act_type,
+ b_type=types.weight_type,
+ group_scales_type=types.group_scale_type,
+ group_zeros_type=types.group_zero_type,
+ token_scales_type=types.token_scale_type,
+ channel_scales_type=types.channel_scale_type,
+ out_type=types.output_type)
+
+ if schedules is None or len(schedules) == 0:
+ raise ValueError("No schedules found to sweep")
+
for schedule in reversed(schedules):
schedule_M = int(schedule.split("_")[0].split("x")[1])
@@ -177,16 +381,11 @@ def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
continue
- def run(a, _, w_q, w_s, schedule=schedule):
- ops.machete_gemm(a,
- w_q,
- wtype,
- w_s,
- b_group_size=group_size,
- schedule=schedule)
-
- res = bench_fn(label, sub_label, "machete_best",
- lambda: loop_over_weights(a, weights_machete, run))
+ res = bench_fns(label, sub_label, "machete_best", [
+ machete_create_bench_fn(
+ bt, out_type=types.output_type, schedule=schedule)
+ for bt in benchmark_tensors
+ ])
results_row = {
"M": m,
@@ -213,25 +412,33 @@ def run(a, _, w_q, w_s, schedule=schedule):
# runner
-def print_timers(timers: Iterable[TMeasurement]):
+def print_timers(timers: List[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
-def run(dtype: torch.dtype, sweep_schedules: bool,
- MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+ types = TypeConfig(
+ act_type=args.act_type,
+ weight_type=scalar_types.uint4b8 if args.group_zero_type is None \
+ else scalar_types.uint4,
+ output_type=args.out_type,
+ group_scale_type=args.group_scale_type,
+ group_zero_type=args.group_zero_type,
+ channel_scale_type=args.channel_scale_type,
+ token_scale_type=args.token_scale_type,
+ )
- results = []
+ results: List[TMeasurement] = []
for m, k, n in MKNs:
- timers = bench(dtype,
- scalar_types.uint4b8,
- 128,
+ timers = bench(types,
+ args.group_size,
m,
k,
n,
- f"{dtype}-gemm",
+ f"{args.act_type}-gemm",
f"MKN=({m}x{k}x{n})",
- sweep_schedules=sweep_schedules)
+ sweep_schedules=args.sweep_schedules)
print_timers(timers)
results.extend(timers)
@@ -240,7 +447,7 @@ def run(dtype: torch.dtype, sweep_schedules: bool,
# output makers
def make_output(
- data: Iterable[TMeasurement],
+ data: List[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None,
@@ -262,7 +469,6 @@ def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-
data = run(args.dtype, args.sweep_schedules, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
@@ -306,33 +512,49 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
for k, n in KNs:
MKNs.append((m, k, n))
- data = run(args.dtype, args.sweep_schedules, MKNs)
+ data = run(args, MKNs)
model_bench_data.append(data)
+ type_string = f"{args.act_type}"
+
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
- print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+ print(f"== Results {type_string} {model}-TP{tp_size} ====")
print_timers(data)
- timestamp = int(time.time())
+ timestr = time.strftime("%Y%m%d-%H%M%S")
- all_data = []
+ all_results = []
for d in model_bench_data:
- all_data.extend(d)
+ all_results.extend(d)
+
# pickle all data
- with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
- pkl.dump(all_data, f)
+ with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f:
+ args_dict = vars(args)
+ args_dict.pop("func")
+ pkl.dump({
+ "args": args_dict,
+ "results": all_results,
+ }, f)
if __name__ == "__main__":
def to_torch_dtype(dt):
- if dt == "bfloat16":
- return torch.bfloat16
- if dt == "float16":
- return torch.float16
- raise ValueError("unsupported dtype")
+ return {
+ "bfloat16": torch.bfloat16,
+ "float16": torch.float16,
+ "int8": torch.int8,
+ "float8_e4m3fn": torch.float8_e4m3fn,
+ "int": torch.int,
+ "float": torch.float,
+ }[dt]
+
+ class ToTorchDtype(argparse.Action):
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ setattr(namespace, self.dest, to_torch_dtype(values))
parser = FlexibleArgumentParser(
description="""
@@ -352,12 +574,42 @@ def to_torch_dtype(dt):
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter,
)
-
parser.add_argument(
- "--dtype",
- type=to_torch_dtype,
+ "--act-type",
+ action=ToTorchDtype,
required=True,
- help="Available options are ['bfloat16', 'float16']",
+ choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'],
+ )
+ parser.add_argument(
+ "--group-scale-type",
+ action=ToTorchDtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--group-zero-type",
+ type=to_torch_dtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--channel-scale-type",
+ action=ToTorchDtype,
+ choices=['float'],
+ )
+ parser.add_argument(
+ "--token-scale-type",
+ action=ToTorchDtype,
+ choices=['float'],
+ )
+ parser.add_argument(
+ "--out-type",
+ action=ToTorchDtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--group-size",
+ type=int,
+ help="Available options are ['None', '-1', '128'], default=128",
+ default=128,
)
parser.add_argument(
"--sweep-schedules",
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 536c133bb3341..8fb44e3a3dbd8 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -131,7 +131,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
- "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -141,7 +141,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
- "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
new file mode 100644
index 0000000000000..baa5de0fff1bd
--- /dev/null
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -0,0 +1,262 @@
+import itertools
+from typing import Optional, Tuple, Union
+
+import torch
+import triton
+from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from torch import nn
+
+from vllm import _custom_ops as vllm_ops
+
+
+class HuggingFaceRMSNorm(nn.Module):
+
+ def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+ orig_dtype = x.dtype
+ x = x.to(torch.float32)
+ if residual is not None:
+ x = x + residual.to(torch.float32)
+ residual = x.to(orig_dtype)
+
+ variance = x.pow(2).mean(dim=-1, keepdim=True)
+ x = x * torch.rsqrt(variance + self.variance_epsilon)
+ x = x.to(orig_dtype) * self.weight
+ if residual is None:
+ return x
+ else:
+ return x, residual
+
+
+def rmsnorm_naive(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ eps: float = 1e-6,
+):
+ naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
+ naive_norm.weight = nn.Parameter(weight)
+ naive_norm = naive_norm.to(x.device)
+
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+ if residual is not None:
+ residual = residual.view(-1, residual.shape[-1])
+
+ output = naive_norm(x, residual)
+
+ if isinstance(output, tuple):
+ output = (output[0].view(orig_shape), output[1].view(orig_shape))
+ else:
+ output = output.view(orig_shape)
+ return output
+
+
+def rmsnorm_flashinfer(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+ if residual is not None:
+ residual = residual.view(-1, residual.shape[-1])
+
+ if residual is not None:
+ fused_add_rmsnorm(x, residual, weight, eps)
+ output = (x, residual)
+ else:
+ output = rmsnorm(x, weight, eps)
+
+ if isinstance(output, tuple):
+ output = (output[0].view(orig_shape), output[1].view(orig_shape))
+ else:
+ output = output.view(orig_shape)
+ return output
+
+
+def rmsnorm_vllm(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+ if residual is not None:
+ residual = residual.view(-1, residual.shape[-1])
+
+ if residual is not None:
+ vllm_ops.fused_add_rms_norm(x, residual, weight, eps)
+ output = (x, residual)
+ else:
+ out = torch.empty_like(x)
+ vllm_ops.rms_norm(out, x, weight, eps)
+ output = out
+
+ if isinstance(output, tuple):
+ output = (output[0].view(orig_shape), output[1].view(orig_shape))
+ else:
+ output = output.view(orig_shape)
+ return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
+ dtype = torch.bfloat16
+ x = torch.randn(batch_size,
+ seq_len,
+ hidden_size,
+ dtype=dtype,
+ device="cuda")
+ weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+ residual = torch.randn_like(x) if use_residual else None
+
+ output_naive = rmsnorm_naive(
+ x.clone(), weight,
+ residual.clone() if residual is not None else None)
+ output_flashinfer = rmsnorm_flashinfer(
+ x.clone(), weight,
+ residual.clone() if residual is not None else None)
+ output_vllm = rmsnorm_vllm(
+ x.clone(), weight,
+ residual.clone() if residual is not None else None)
+
+ if use_residual:
+ output_naive = output_naive[0]
+ output_flashinfer = output_flashinfer[0]
+ output_vllm = output_vllm[0]
+
+ print(f"Naive output={output_naive}")
+ print(f"FlashInfer output={output_flashinfer}")
+ print(f"VLLM output={output_vllm}")
+
+ if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
+ rtol=1e-2) and torch.allclose(
+ output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+ print("✅ All implementations match")
+ else:
+ print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+head_num_range = [32, 48]
+configs = list(
+ itertools.product(head_num_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark(use_residual):
+
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["head_num", "batch_size", "seq_len"],
+ x_vals=[list(_) for _ in configs],
+ line_arg="provider",
+ line_vals=["huggingface", "flashinfer", "vllm"],
+ line_names=["HuggingFace", "FlashInfer", "vLLM"],
+ styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name=
+ f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual",
+ args={},
+ ))
+ def benchmark(head_num, batch_size, seq_len, provider):
+ dtype = torch.bfloat16
+ hidden_size = head_num * 128 # assuming head_dim = 128
+
+ x = torch.randn(batch_size,
+ seq_len,
+ hidden_size,
+ dtype=dtype,
+ device="cuda")
+ weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+ residual = torch.randn_like(x) if use_residual else None
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "huggingface":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rmsnorm_naive(
+ x.clone(),
+ weight,
+ residual.clone() if residual is not None else None,
+ ),
+ quantiles=quantiles,
+ )
+ elif provider == "flashinfer":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rmsnorm_flashinfer(
+ x.clone(),
+ weight,
+ residual.clone() if residual is not None else None,
+ ),
+ quantiles=quantiles,
+ )
+ else:
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rmsnorm_vllm(
+ x.clone(),
+ weight,
+ residual.clone() if residual is not None else None,
+ ),
+ quantiles=quantiles,
+ )
+
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+ return benchmark
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=4,
+ help="Batch size",
+ )
+ parser.add_argument(
+ "--seq-len",
+ type=int,
+ default=128,
+ help="Sequence length",
+ )
+ parser.add_argument(
+ "--hidden-size",
+ type=int,
+ default=4096,
+ help="Hidden size (2nd dimension) of the sequence",
+ )
+ parser.add_argument("--use-residual",
+ action="store_true",
+ help="Whether to use residual connection")
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/rmsnorm/",
+ help="Path to save rmsnorm benchmark results",
+ )
+
+ args = parser.parse_args()
+
+ # Run correctness test
+ calculate_diff(batch_size=args.batch_size,
+ seq_len=args.seq_len,
+ hidden_size=args.hidden_size,
+ use_residual=args.use_residual)
+
+ # Get the benchmark function with proper use_residual setting
+ benchmark = get_benchmark(args.use_residual)
+ # Run performance benchmark
+ benchmark.run(print_data=True, save_path=args.save_path)
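
All three implementations benchmarked above compute the same fused add + RMSNorm; as a compact reference of that math (a hedged sketch mirroring rmsnorm_naive, not an additional benchmark path):

import torch

def fused_add_rmsnorm_reference(x, residual, weight, eps=1e-6):
    # The residual add happens first; the summed value is both normalized and
    # handed back as the new residual, which is why the functions above return
    # (output, residual) tuples when a residual is given.
    hidden = x.float() + residual.float()
    new_residual = hidden.to(x.dtype)
    variance = hidden.pow(2).mean(dim=-1, keepdim=True)
    out = (hidden * torch.rsqrt(variance + eps)).to(x.dtype) * weight
    return out, new_residual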
diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py
index de608fd05af70..7d0bd84150a27 100644
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -20,10 +20,11 @@
args = parser.parse_args()
with open(args.filename, 'rb') as f:
- data: List[TMeasurement] = pickle.load(f)
+ data = pickle.load(f)
+ raw_results: List[TMeasurement] = data["results"]
results = defaultdict(lambda: list())
- for v in data:
+ for v in raw_results:
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
if result is not None:
KN = result.group(1)
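
Because the machete benchmark now pickles a dict rather than a bare list of measurements, consumers unpack it the way the updated plotting script does. A small sketch of the expected round trip (the file name and args shown are illustrative):

import pickle

# Writing side (benchmark_machete.py): results are stored together with the
# CLI arguments that produced them.
payload = {
    "args": {"act_type": "float8_e4m3fn", "group_size": 128},
    "results": [],  # list of torch.utils.benchmark Measurement objects
}
with open("model_bench-example.pkl", "wb") as f:
    pickle.dump(payload, f)

# Reading side (graph_machete_bench.py): index into "results" instead of
# treating the whole pickle as the list itself.
with open("model_bench-example.pkl", "rb") as f:
    data = pickle.load(f)
raw_results = data["results"]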
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index 25ec9d6028627..51f24f3ba1774 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -40,4 +40,10 @@
([8192, 57344], 1),
([28672, 8192], 0),
],
+ "meta-llama/Llama-3.1-405b-hf": [
+ ([16384, 18432], 1),
+ ([16384, 16384], 0),
+ ([16384, 106496], 1),
+ ([53248, 16384], 0),
+ ],
}
diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json
new file mode 100644
index 0000000000000..6003698469e8d
--- /dev/null
+++ b/benchmarks/structured_schemas/structured_schema_1.json
@@ -0,0 +1,113 @@
+{
+ "$schema":
+ "https://json-schema.org/draft/2020-12/schema",
+ "title":
+ "User Profile",
+ "type":
+ "object",
+ "properties": {
+ "userId": {
+ "type": "string",
+ "description": "Unique identifier for the user."
+ },
+ "personalInfo": {
+ "type": "object",
+ "properties": {
+ "firstName": {
+ "type": "string",
+ "description": "The user's first name."
+ },
+ "lastName": {
+ "type": "string",
+ "description": "The user's last name."
+ },
+ "age": {
+ "type": "integer",
+ "minimum": 0,
+ "description": "The user's age."
+ },
+ "phoneNumbers": {
+ "type":
+ "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["home", "work", "mobile"],
+ "description": "Type of phone number."
+ },
+ "number": {
+ "type": "string",
+ "pattern": "^\\+?[1-9]\\d{1,14}$",
+ "description": "Phone number in E.164 format."
+ }
+ },
+ "required": ["type", "number"]
+ },
+ "description":
+ "List of phone numbers associated with the user."
+ }
+ },
+ "required": ["firstName", "lastName"]
+ },
+ "address": {
+ "type": "object",
+ "properties": {
+ "street": {
+ "type": "string",
+ "description": "Street address."
+ },
+ "city": {
+ "type": "string",
+ "description": "City name."
+ },
+ "state": {
+ "type": "string",
+ "description": "State or province."
+ },
+ "postalCode": {
+ "type": "string",
+ "pattern": "^\\d{5}(-\\d{4})?$",
+ "description": "Postal code."
+ },
+ "country": {
+ "type": "string",
+ "description": "Country name."
+ }
+ },
+ "required": ["street", "city", "state", "postalCode", "country"]
+ },
+ "preferences": {
+ "type": "object",
+ "properties": {
+ "newsletterSubscribed": {
+ "type":
+ "boolean",
+ "description":
+ "Indicates if the user is subscribed to the newsletter."
+ },
+ "favoriteCategories": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "List of user's favorite categories."
+ }
+ },
+ "required": ["newsletterSubscribed"]
+ },
+ "accountStatus": {
+ "type": "string",
+ "enum": ["active", "inactive", "suspended"],
+ "description": "Current status of the user's account."
+ },
+ "registrationDate": {
+ "type": "string",
+ "format": "date-time",
+ "description": "ISO 8601 formatted date-time of user registration."
+ }
+ },
+ "required":
+ ["userId", "personalInfo", "address", "accountStatus", "registrationDate"]
+}
\ No newline at end of file
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 5912c5c02ede7..68f7ca1af05ad 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -16,9 +16,14 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
#
# Check the compile flags
#
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+ list(APPEND CXX_COMPILE_FLAGS
+ "-mf16c"
+ )
+endif()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
- "-mf16c"
"-DVLLM_CPU_EXTENSION")
execute_process(COMMAND cat /proc/cpuinfo
@@ -53,6 +58,8 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
+find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
+find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
if (AVX512_FOUND AND NOT AVX512_DISABLED)
list(APPEND CXX_COMPILE_FLAGS
@@ -72,9 +79,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
else()
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
endif()
+
elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
+
elseif (POWER9_FOUND OR POWER10_FOUND)
message(STATUS "PowerPC detected")
# Check for PowerPC VSX support
@@ -82,8 +91,20 @@ elseif (POWER9_FOUND OR POWER10_FOUND)
"-mvsx"
"-mcpu=native"
"-mtune=native")
+
+elseif (ASIMD_FOUND)
+ message(STATUS "ARMv8 or later architecture detected")
+ if(ARM_BF16_FOUND)
+ message(STATUS "BF16 extension detected")
+ set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
+ add_compile_definitions(ARM_BF16_SUPPORT)
+ else()
+ message(WARNING "BF16 functionality is not available")
+ set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
+ endif()
+ list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
else()
- message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
+ message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
endif()
#
@@ -153,4 +174,4 @@ define_gpu_extension_target(
WITH_SOABI
)
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
\ No newline at end of file
diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu
index 741cd0c82dc89..cb1a069942069 100644
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@@ -140,13 +140,10 @@ void paged_attention_v1_launcher(
blocksparse_block_size, blocksparse_head_sliding_step);
#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
+ if (is_block_sparse) { \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ } else { \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu
index 6de8d0bdd5b8d..c457bdb89008e 100644
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@@ -147,13 +147,10 @@ void paged_attention_v2_launcher(
blocksparse_head_sliding_step);
#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
+ if (is_block_sparse) { \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ } else { \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 1be806bbfa43c..8a95279f9a25a 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -307,10 +307,20 @@ void reshape_and_cache_flash(
torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor&
value_cache, // [num_blocks, block_size, num_heads, head_size]
- torch::Tensor& slot_mapping, // [num_tokens]
+ torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, const double k_scale,
const double v_scale) {
- int num_tokens = key.size(0);
+ // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
+ // slot_mapping.size(0) because of padding for CUDA graphs.
+ // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+ // both include padding.
+ // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+ // since key includes padding for CUDA graphs, while slot_mapping does not.
+ // In this case, slot_mapping.size(0) represents the actual number of tokens
+ // before padding.
+ // For compatibility with both cases, we use slot_mapping.size(0) as the
+ // number of tokens.
+ int num_tokens = slot_mapping.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
int block_size = key_cache.size(1);
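
A small illustration of the situation the note above describes; the shapes are made up and only the relationship between them matters:

import torch

# vLLM V1: `key` may be padded up to the CUDA-graph batch size, while
# `slot_mapping` only covers the real tokens, so slot_mapping.size(0) is the
# safe token count for the cache-write loop.
num_actual_tokens, padded_num_tokens = 13, 16
key = torch.zeros(padded_num_tokens, 8, 128)    # includes CUDA-graph padding
slot_mapping = torch.arange(num_actual_tokens)  # no padding
num_tokens = slot_mapping.shape[0]              # 13, not key.shape[0] == 16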
diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp
new file mode 100644
index 0000000000000..ba9f40a230c8e
--- /dev/null
+++ b/csrc/core/math.hpp
@@ -0,0 +1,7 @@
+#include <climits>
+#include <cstdint>
+
+inline uint32_t next_pow_2(uint32_t const num) {
+ if (num <= 1) return num;
+ return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
\ No newline at end of file
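
For orientation, the helper above rounds up to the next power of two with a count-leading-zeros trick; a Python sketch of the same computation:

def next_pow_2(num: int) -> int:
    """Python equivalent of the CHAR_BIT * sizeof - __builtin_clz formula."""
    if num <= 1:
        return num
    return 1 << (num - 1).bit_length()

assert [next_pow_2(n) for n in (0, 1, 2, 3, 5, 8, 9)] == [0, 1, 2, 4, 8, 8, 16]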
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index e73eca1b345fd..e21832ba7582f 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -24,12 +24,20 @@ struct KernelVecType {
template <>
struct KernelVecType<c10::Half> {
+#ifdef __powerpc64__
+ // Power architecture-specific vector types
+ using q_load_vec_type = vec_op::FP32Vec8;
+ using k_load_vec_type = vec_op::FP32Vec16;
+ using v_load_vec_type = vec_op::FP32Vec16;
+#else
+ // Fallback for other architectures, including x86
using q_load_vec_type = vec_op::FP16Vec8;
- using q_vec_type = vec_op::FP32Vec16;
using k_load_vec_type = vec_op::FP16Vec16;
+ using v_load_vec_type = vec_op::FP16Vec16;
+#endif
+ using q_vec_type = vec_op::FP32Vec16;
using k_vec_type = vec_op::FP32Vec16;
using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::FP16Vec16;
};
#ifdef __AVX512BF16__
@@ -43,6 +51,21 @@ struct KernelVecType {
using v_load_vec_type = vec_op::BF16Vec16;
};
#else
+ #ifdef __aarch64__
+ #ifndef ARM_BF16_SUPPORT
+ // pass
+ #else
+template <>
+struct KernelVecType<c10::BFloat16> {
+ using q_load_vec_type = vec_op::BF16Vec8;
+ using q_vec_type = vec_op::FP32Vec16;
+ using k_load_vec_type = vec_op::BF16Vec16;
+ using k_vec_type = vec_op::FP32Vec16;
+ using qk_acc_vec_type = vec_op::FP32Vec16;
+ using v_load_vec_type = vec_op::BF16Vec16;
+};
+ #endif
+ #else
template <>
struct KernelVecType<c10::BFloat16> {
using q_load_vec_type = vec_op::BF16Vec8;
@@ -52,6 +75,7 @@ struct KernelVecType {
using qk_acc_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
+ #endif
#endif
template
@@ -771,4 +795,4 @@ void paged_attention_v2(
CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
});
-}
+}
\ No newline at end of file
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index 0213be09105ed..28db0479748bf 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -1,4 +1,3 @@
-
#ifndef CPU_TYPES_HPP
#define CPU_TYPES_HPP
@@ -8,8 +7,11 @@
#elif defined(__POWER9_VECTOR__)
//ppc implementation
#include "cpu_types_vsx.hpp"
+#elif defined(__aarch64__)
+ //arm implementation
+ #include "cpu_types_arm.hpp"
#else
#warning "unsupported vLLM cpu implementation"
#endif
-#endif
+#endif
\ No newline at end of file
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
new file mode 100644
index 0000000000000..73e0f8cb2e0fb
--- /dev/null
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -0,0 +1,515 @@
+#include <arm_neon.h>
+#include <torch/all.h>
+#include <cmath>
+
+namespace vec_op {
+
+#ifdef ARM_BF16_SUPPORT
+ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+#else
+ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
+#endif
+
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+#ifndef CPU_OP_GUARD
+#define CPU_KERNEL_GUARD_IN(NAME)
+#define CPU_KERNEL_GUARD_OUT(NAME)
+#else
+#define CPU_KERNEL_GUARD_IN(NAME) \
+ std::cout << #NAME << " invoked." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+#endif
+
+#define FORCE_INLINE __attribute__((always_inline)) inline
+
+namespace {
+  template <typename T, T... indexes, typename F>
+  constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
+    (f(std::integral_constant<T, indexes>{}), ...);
+  };
+};
+
+template <typename T, T count, typename F,
+          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
+constexpr void unroll_loop(F &&f) {
+  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
+}
+
+template <typename T> struct Vec {
+ constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
+};
+
+struct FP32Vec8;
+struct FP32Vec16;
+
+struct FP16Vec8 : public Vec<FP16Vec8> {
+ constexpr static int VEC_ELEM_NUM = 8;
+
+ float16x8_t reg;
+
+ explicit FP16Vec8(const void *ptr)
+    : reg(vld1q_f16(static_cast<const __fp16 *>(ptr))) {};
+
+ explicit FP16Vec8(const FP32Vec8 &);
+
+ void save(void *ptr) const {
+ vst1q_f16(static_cast<__fp16 *>(ptr), reg);
+ }
+};
+
+struct FP16Vec16 : public Vec<FP16Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
+
+ float16x8x2_t reg;
+
+ explicit FP16Vec16(const void *ptr) {
+    reg.val[0] = vld1q_f16(reinterpret_cast<const __fp16 *>(ptr));
+    reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16 *>(ptr) + 8);
+ }
+
+ explicit FP16Vec16(const FP32Vec16& vec);
+
+ void save(void *ptr) const {
+ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
+ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
+ }
+
+ void save(void *ptr, const int elem_num) const {
+ int full_blocks = elem_num / 8;
+ int remainder = elem_num % 8;
+
+ if (full_blocks > 0) {
+ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
+ if (full_blocks > 1) {
+ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
+ }
+ }
+
+ if (remainder > 0) {
+ float16x8_t temp = reg.val[full_blocks];
+ for (int i = 0; i < remainder; ++i) {
+ reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i);
+ }
+ }
+ }
+};
+
+
+#ifdef ARM_BF16_SUPPORT
+struct BF16Vec8 : public Vec<BF16Vec8> {
+ constexpr static int VEC_ELEM_NUM = 8;
+
+ bfloat16x8_t reg;
+
+ explicit BF16Vec8(const void *ptr)
+    : reg(*reinterpret_cast<const bfloat16x8_t *>(ptr)) {};
+
+ explicit BF16Vec8(bfloat16x8_t data) : reg(data) {};
+
+ explicit BF16Vec8(const FP32Vec8 &);
+
+ explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {};
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8_t *>(ptr) = reg; }
+};
+
+struct BF16Vec16 : public Vec<BF16Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
+
+ bfloat16x8x2_t reg;
+
+ explicit BF16Vec16(const void *ptr)
+    : reg(*reinterpret_cast<const bfloat16x8x2_t *>(ptr)) {};
+
+ explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
+
+ explicit BF16Vec16(const FP32Vec16 &);
+
+ explicit BF16Vec16(float32x4x4_t v) : reg({
+ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]),
+ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])
+ }){};
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8x2_t *>(ptr) = reg; };
+};
+
+struct BF16Vec32 : public Vec<BF16Vec32> {
+ constexpr static int VEC_ELEM_NUM = 32;
+
+ bfloat16x8x4_t reg;
+
+ explicit BF16Vec32(const void *ptr)
+    : reg(*reinterpret_cast<const bfloat16x8x4_t *>(ptr)) {};
+
+ explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {};
+
+ explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({
+ vec8_data.reg,
+ vec8_data.reg,
+ vec8_data.reg,
+ vec8_data.reg
+ }) {};
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8x4_t *>(ptr) = reg; };
+};
+#endif
+
+struct FP32Vec4 : public Vec<FP32Vec4> {
+ constexpr static int VEC_ELEM_NUM = 4;
+
+ union AliasReg {
+ float32x4_t reg;
+ float values[VEC_ELEM_NUM];
+ };
+
+ float32x4_t reg;
+
+ explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {};
+
+ explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {};
+
+ explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {};
+
+ explicit FP32Vec4(float32x4_t data) : reg(data) {};
+
+ explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {};
+};
+
+struct FP32Vec8 : public Vec<FP32Vec8> {
+ constexpr static int VEC_ELEM_NUM = 8;
+ union AliasReg {
+ float32x4x2_t reg;
+ float values[VEC_ELEM_NUM];
+ };
+
+ float32x4x2_t reg;
+
+ explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {};
+
+ explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {};
+
+ explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {};
+
+ explicit FP32Vec8(float32x4x2_t data) : reg(data) {};
+
+ explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {};
+
+ explicit FP32Vec8(const FP16Vec8 &v) {
+ reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg));
+ reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg));
+ };
+
+ explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {};
+
+ #ifdef ARM_BF16_SUPPORT
+
+ explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {};
+
+ explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {};
+
+ #endif
+
+ float reduce_sum() const {
+ AliasReg ar;
+ ar.reg = reg;
+ float answer = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; });
+
+ return answer;
+ }
+
+ FP32Vec8 exp() const {
+ AliasReg ar;
+ ar.reg = reg;
+
+ float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])};
+ float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])};
+ float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])};
+ float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])};
+
+ float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1);
+ float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3);
+
+ float32x4x2_t result;
+ result.val[0] = result0;
+ result.val[1] = result1;
+
+ return FP32Vec8(result);
+ }
+
+ FP32Vec8 tanh() const {
+ AliasReg ar;
+ ar.reg = reg;
+
+ float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])};
+ float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])};
+ float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])};
+ float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])};
+
+ float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1);
+ float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3);
+
+ float32x4x2_t result;
+ result.val[0] = result0;
+ result.val[1] = result1;
+
+ return FP32Vec8(result);
+ }
+
+ FP32Vec8 er() const {
+ AliasReg ar;
+ ar.reg = reg;
+
+    float32x2_t er_vec0 = {static_cast<float32_t>(erf(ar.values[0])), static_cast<float32_t>(erf(ar.values[1]))};
+    float32x2_t er_vec1 = {static_cast<float32_t>(erf(ar.values[2])), static_cast<float32_t>(erf(ar.values[3]))};
+    float32x2_t er_vec2 = {static_cast<float32_t>(erf(ar.values[4])), static_cast<float32_t>(erf(ar.values[5]))};
+    float32x2_t er_vec3 = {static_cast<float32_t>(erf(ar.values[6])), static_cast<float32_t>(erf(ar.values[7]))};
+
+ float32x4_t result0 = vcombine_f32(er_vec0, er_vec1);
+ float32x4_t result1 = vcombine_f32(er_vec2, er_vec3);
+
+ float32x4x2_t result;
+ result.val[0] = result0;
+ result.val[1] = result1;
+
+ return FP32Vec8(result);
+ }
+
+ FP32Vec8 operator*(const FP32Vec8 &b) const {
+ return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])}));
+ }
+
+ FP32Vec8 operator+(const FP32Vec8 &b) const {
+ return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])}));
+ }
+
+ FP32Vec8 operator-(const FP32Vec8 &b) const {
+ return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])}));
+ }
+
+ FP32Vec8 operator/(const FP32Vec8 &b) const {
+ return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])}));
+ }
+
+ void save(float *ptr) const {
+ vst1q_f32(ptr, reg.val[0]);
+ vst1q_f32(ptr + 4, reg.val[1]);
+ }
+};
+
+struct FP32Vec16 : public Vec<FP32Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
+ union AliasReg {
+ float32x4x4_t reg;
+ float values[VEC_ELEM_NUM];
+ };
+
+ float32x4x4_t reg;
+
+ explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {}
+
+ explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}
+
+ explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {}
+
+ explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
+
+ explicit FP32Vec16(const FP32Vec8 &data) {
+ reg.val[0] = data.reg.val[0];
+ reg.val[1] = data.reg.val[1];
+ reg.val[2] = data.reg.val[0];
+ reg.val[3] = data.reg.val[1];
+ }
+
+ explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
+
+ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {}
+
+ #ifdef ARM_BF16_SUPPORT
+ explicit FP32Vec16(bfloat16x8x2_t v) : reg({
+ vcvtq_low_f32_bf16(v.val[0]),
+ vcvtq_high_f32_bf16(v.val[0]),
+ vcvtq_low_f32_bf16(v.val[1]),
+ vcvtq_high_f32_bf16(v.val[1])
+ }) {};
+ #endif
+
+ explicit FP32Vec16(const FP32Vec4 &data) {
+ reg.val[0] = data.reg;
+ reg.val[1] = data.reg;
+ reg.val[2] = data.reg;
+ reg.val[3] = data.reg;
+ };
+
+ #ifdef ARM_BF16_SUPPORT
+ explicit FP32Vec16(const BF16Vec16 &v) : reg({
+ vcvtq_low_f32_bf16(v.reg.val[0]),
+ vcvtq_high_f32_bf16(v.reg.val[0]),
+ vcvtq_low_f32_bf16(v.reg.val[1]),
+ vcvtq_high_f32_bf16(v.reg.val[1])
+ }) {};
+
+ explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {};
+ #endif
+
+ explicit FP32Vec16(const FP16Vec16 &v) {
+ reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0]));
+ reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0]));
+ reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1]));
+ reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
+ };
+
+ FP32Vec16 operator+(const FP32Vec16 &b) const {
+ return FP32Vec16(float32x4x4_t({
+ vaddq_f32(reg.val[0], b.reg.val[0]),
+ vaddq_f32(reg.val[1], b.reg.val[1]),
+ vaddq_f32(reg.val[2], b.reg.val[2]),
+ vaddq_f32(reg.val[3], b.reg.val[3])}));
+ };
+
+ FP32Vec16 operator*(const FP32Vec16 &b) const {
+ return FP32Vec16(float32x4x4_t({
+ vmulq_f32(reg.val[0], b.reg.val[0]),
+ vmulq_f32(reg.val[1], b.reg.val[1]),
+ vmulq_f32(reg.val[2], b.reg.val[2]),
+ vmulq_f32(reg.val[3], b.reg.val[3])}));
+ };
+
+ FP32Vec16 operator-(const FP32Vec16 &b) const {
+ return FP32Vec16(float32x4x4_t({
+ vsubq_f32(reg.val[0], b.reg.val[0]),
+ vsubq_f32(reg.val[1], b.reg.val[1]),
+ vsubq_f32(reg.val[2], b.reg.val[2]),
+ vsubq_f32(reg.val[3], b.reg.val[3])
+ }));
+ };
+
+ FP32Vec16 operator/(const FP32Vec16 &b) const {
+ return FP32Vec16(float32x4x4_t({
+ vdivq_f32(reg.val[0], b.reg.val[0]),
+ vdivq_f32(reg.val[1], b.reg.val[1]),
+ vdivq_f32(reg.val[2], b.reg.val[2]),
+ vdivq_f32(reg.val[3], b.reg.val[3])
+ }));
+ };
+
+ float reduce_sum() const {
+ AliasReg ar;
+ ar.reg = reg;
+ float answer = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; });
+
+ return answer;
+ };
+
+  template <int group_size> float reduce_sub_sum(int idx) {
+ static_assert(VEC_ELEM_NUM % group_size == 0);
+
+ AliasReg ar;
+ ar.reg = reg;
+ float answer = 0;
+ const int start = idx * group_size;
+    unroll_loop<int, group_size>(
+ [&answer, &start, ar](int i) { answer += ar.values[start + i]; });
+
+ return answer;
+ };
+
+ void save(float *ptr) const {
+ vst1q_f32(ptr, reg.val[0]);
+ vst1q_f32(ptr + 4, reg.val[1]);
+ vst1q_f32(ptr + 8, reg.val[2]);
+ vst1q_f32(ptr + 12, reg.val[3]);
+ };
+};
+
+template <typename T> struct VecType { using vec_type = void; };
+
+template <typename T> using vec_t = typename VecType<T>::vec_type;
+
+template <> struct VecType<float> { using vec_type = FP32Vec8; };
+
+template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
+
+#ifdef ARM_BF16_SUPPORT
+template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+#endif
+
+template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+
+template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
+ *reinterpret_cast<__fp16 *>(ptr) = v;
+}
+
+inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) {
+ float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]);
+ float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]);
+ float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]);
+ float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]);
+
+ reg.val[0] = vcombine_f16(low_0, high_0);
+ reg.val[1] = vcombine_f16(low_1, high_1);
+};
+
+inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) {
+ float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]);
+ float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]);
+
+ reg = vcombine_f16(lower_half, upper_half);
+};
+
+inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+
+ acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]);
+ acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]);
+ acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]);
+ acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]);
+};
+
+#ifdef ARM_BF16_SUPPORT
+inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
+
+ float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0]));
+ float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0]));
+ float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1]));
+ float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1]));
+
+ float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0]));
+ float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0]));
+ float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1]));
+ float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1]));
+
+ acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low);
+ acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high);
+ acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low);
+ acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high);
+};
+#endif
+
+#ifdef ARM_BF16_SUPPORT
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {};
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({
+ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]),
+ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3])
+ }){};
+#endif
+
+inline void prefetch(const void *addr) {
+ __builtin_prefetch(addr, 0, 1);
+};
+
+#ifdef ARM_BF16_SUPPORT
+template <>
+inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+ *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v);
+};
+#endif
+};
\ No newline at end of file
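
The ARM_BF16_SUPPORT fma above widens bfloat16 operands to fp32 and accumulates in fp32; a rough per-lane reference of that behaviour (purely illustrative, not part of the header):

import torch

def bf16_widening_fma(acc: torch.Tensor, a: torch.Tensor, b: torch.Tensor):
    # acc += float32(a) * float32(b), matching the vcvt_f32_bf16 + vfmaq_f32
    # sequence in the ARM_BF16_SUPPORT branch above.
    return acc + a.to(torch.float32) * b.to(torch.float32)

acc = torch.zeros(32)                      # fp32 accumulator (BF16Vec32 width)
a = torch.randn(32, dtype=torch.bfloat16)
b = torch.randn(32, dtype=torch.bfloat16)
acc = bf16_widening_fma(acc, a, b)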
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
index f42fa2361a2db..d9aed657a3113 100644
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@@ -25,7 +25,13 @@ struct KernelVecType {
template <>
struct KernelVecType<c10::Half> {
+#ifdef __powerpc64__
+ // Power architecture-specific vector type
+ using load_vec_type = vec_op::FP32Vec16;
+#else
+ // Fallback for other architectures
using load_vec_type = vec_op::FP16Vec16;
+#endif
using azp_adj_load_vec_type = vec_op::INT32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
diff --git a/csrc/cutlass_extensions/common.cpp b/csrc/cutlass_extensions/common.cpp
new file mode 100644
index 0000000000000..3d2093ab94297
--- /dev/null
+++ b/csrc/cutlass_extensions/common.cpp
@@ -0,0 +1,11 @@
+#include "cutlass_extensions/common.hpp"
+
+int32_t get_sm_version_num() {
+ int32_t major_capability, minor_capability;
+ cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
+ 0);
+ cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
+ 0);
+ int32_t version_num = major_capability * 10 + minor_capability;
+ return version_num;
+}
\ No newline at end of file
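
For reference, the same SM version number can be read from PyTorch; a hedged equivalent of get_sm_version_num() for quick checks:

import torch

# major * 10 + minor, e.g. 90 for sm90 (Hopper); mirrors get_sm_version_num().
major, minor = torch.cuda.get_device_capability(0)
sm_version_num = major * 10 + minor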
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
new file mode 100644
index 0000000000000..85e359aa57113
--- /dev/null
+++ b/csrc/cutlass_extensions/common.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include
+#include "cuda_runtime.h"
+#include
+
+/**
+ * Helper function for checking CUTLASS errors
+ */
+#define CUTLASS_CHECK(status) \
+ { \
+ cutlass::Status error = status; \
+ TORCH_CHECK(error == cutlass::Status::kSuccess, \
+ cutlassGetStatusString(error)); \
+ }
+
+/**
+ * Panic wrapper for unwinding CUDA runtime errors
+ */
+#define CUDA_CHECK(status) \
+ { \
+ cudaError_t error = status; \
+ TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
+ }
+
+inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
+ int max_shared_mem_per_block_opt_in = 0;
+ cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
+ cudaDevAttrMaxSharedMemoryPerBlockOptin,
+ device);
+ return max_shared_mem_per_block_opt_in;
+}
+
+int32_t get_sm_version_num();
diff --git a/csrc/cutlass_extensions/cute_utils.cuh b/csrc/cutlass_extensions/cute_utils.cuh
index 1842fab8b2cac..f61fe3ceb978a 100644
--- a/csrc/cutlass_extensions/cute_utils.cuh
+++ b/csrc/cutlass_extensions/cute_utils.cuh
@@ -20,9 +20,9 @@ CUTE_HOST_DEVICE static constexpr auto permute_layout(Layout l) {
// is the layout f(x) = x
template <typename Layout>
CUTE_HOST_DEVICE static constexpr bool is_identity_layout() {
-  if constexpr (std::is_same_v<Layout, void>)
+  if constexpr (std::is_same_v<Layout, void>) {
return true;
- else {
+ } else {
constexpr auto coalesced_layout = coalesce(Layout{});
if constexpr (rank(coalesced_layout) == 1 &&
stride<0>(coalesced_layout) == 1) {
diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp
similarity index 99%
rename from csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
rename to csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp
index d407d66ab2aa6..7aa87feb4cce2 100644
--- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
+++ b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp
@@ -52,6 +52,7 @@
// clang-format off
#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
+#include "cutlass/epilogue/threadblock/fusion/visitors.hpp"
#include "cute/tensor.hpp"
namespace cutlass::epilogue::threadblock {
diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
rename to csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
new file mode 100644
index 0000000000000..c69e87999ae71
--- /dev/null
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
@@ -0,0 +1,317 @@
+#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp"
+
+/*
+ This file defines custom epilogues for fusing channel scales, token scales,
+ bias, and activation zero-points onto a GEMM operation using the
+ CUTLASS 2.x API, for sm80 (Ampere) NVIDIA GPUs.
+
+ Epilogues must contain a public type named EVTCompute of type Sm80EVT,
+ as well as a static prepare_args function that constructs an
+ EVTCompute::Arguments struct.
+*/
+
+namespace vllm::c2x {
+
+using namespace cute;
+
+/*
+ * This class provides the common load descriptors for the
+ * ScaledEpilogue[...] classes
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBase {
+ protected:
+  using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
+
+  template <typename T>
+  using ColOrScalarLoad =
+      cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
+          OutputTileThreadMap, T, Stride<Int<1>, Int<0>, Int<0>>>;
+
+  template <typename T>
+  using RowOrScalarLoad =
+      cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast<
+          OutputTileThreadMap, T, Stride<Int<0>, Int<1>, Int<0>>>;
+
+  template <typename T>
+  using ColLoad = cutlass::epilogue::threadblock::VisitorColBroadcast<
+      OutputTileThreadMap, T, Stride<Int<1>, Int<0>, Int<0>>>;
+
+  template <typename T>
+  using RowLoad = cutlass::epilogue::threadblock::VisitorRowBroadcast<
+      OutputTileThreadMap, T, Stride<Int<0>, Int<1>, Int<0>>>;
+
+  template <typename T>
+  using RowOrZeroLoad =
+      cutlass::epilogue::threadblock::VisitorRowOrZeroBroadcast<
+          OutputTileThreadMap, T, Stride<Int<0>, Int<1>, Int<0>>>;
+
+ // This utility function constructs the arguments for the load descriptors
+ // from a tensor. It can handle both row and column, as well as row/column or
+ // scalar cases.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(torch::Tensor const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = static_cast<T*>(tensor.data_ptr());
+    if constexpr (std::is_same_v<Descriptor, ColOrScalarLoad<T>> ||
+                  std::is_same_v<Descriptor, RowOrScalarLoad<T>>) {
+      return Arguments{data_ptr, tensor.numel() != 1};
+    } else {
+      // it would technically work but no use case as data_ptr is never nullptr
+      static_assert(!std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
+      return Arguments{data_ptr};
+    }
+ }
+
+ // This overload handles the case where there might not be a tensor, in which
+ // case a nullptr is passed and a constant (0) is used.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
+    static_assert(std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
+ return Arguments{data_ptr};
+ }
+};
+
+/*
+ This epilogue function defines a quantized GEMM operation similar to
+ torch._scaled_mm.
+
+ A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
+ per-row. B can be quantized per-tensor or per-column.
+ Any combination of per-tensor and per-row or column is supported.
+ A and B must have symmetric quantization (zero point == 0).
+
+ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
+ scales are applied elementwise with numpy-style broadcasting.
+
+ ScaleA and ScaleB define the epilogue functions that apply the scales for
+ the A and B operands respectively. These scales may be either per-tensor or
+ per row or column.
+*/
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogue
+    : private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+
+ using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTCompute0 =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
+
+ using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+ using EVTCompute =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA, EVTCompute0>;
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+
+ typename EVTCompute0::Arguments evt0_args{b_args};
+ return ArgumentType{a_args, evt0_args};
+ }
+};
+
+/*
+ * This epilogue performs the same operation as ScaledEpilogue, but adds a bias.
+ * This bias can also be used in the per-tensor azp case, where the activation
+ * zero point (azp) is used to compute an azp correction term,
+ * which is folded into the bias.
+ *
+ * The bias tensor must be per-output channel.
+ * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBias
+    : protected ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ protected:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowLoad<ElementD>;
+ using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTCompute0 =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
+
+ using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA,
+                                                             EVTCompute0, Bias>;
+ using ArgumentType = typename EVTCompute::Arguments;
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
+
+ typename EVTCompute0::Arguments evt0_args{b_args};
+ return ArgumentType{a_args, evt0_args, bias_args};
+ }
+};
+
+/*
+ * This epilogue directly supports per-tensor azp in int32 form.
+ * As opposed to the per-token epilogue below, this epilogue only has an azp_adj
+ * term, which should already be multiplied with the scalar azp.
+ * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B.
+ *
+ * This epilogue also supports bias, which remains per-channel.
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBiasAzp
+    : protected ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowOrZeroLoad<ElementD>;
+
+  // This is the full AZP term, azp * J @ B, shape (1,n)
+  using AzpWithAdj = typename SUPER::template RowLoad<int32_t>;
+
+ // Compute float(accum - azp_adj), both operands are int32_t
+ using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::minus, float, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeAzp =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeAzp, Accum, AzpWithAdj>;
+
+ using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeScaleB =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeScaleB, ScaleB,
+                                              EVTComputeAzp>;
+
+ using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+ using EVTCompute =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeScaleBiasA, ScaleA,
+                                              EVTComputeScaleB, Bias>;
+
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+                                   c10::optional<torch::Tensor> const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
+    auto azp_adj_args =
+        SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
+
+ typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
+ return ArgumentType{a_args, evt_scale_b_args, bias_args};
+ }
+};
+
+/*
+ * This epilogue supports per-token azp by computing and applying
+ * the correction term using a rank-1 update. If the term were materialized,
+ * it would require O(m*n) space, and this way it only requires O(m+n) space.
+ * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero
+ * point for each row of A.
+ * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B.
+ *
+ * This epilogue also supports bias, which remains per-channel.
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBiasAzpToken
+    : protected ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowOrZeroLoad<ElementD>;
+
+  // Per-token azp term, shape (m,1)
+  using Azp = typename SUPER::template ColLoad<int32_t>;
+
+  // This is the AZP adjustment term, J @ B, shape (1,n)
+  using AzpAdj = typename SUPER::template RowLoad<int32_t>;
+
+ // Compute azp * azp_adj
+ using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, int32_t, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeAzp =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeAzp, Azp, AzpAdj>;
+
+ // Compute float(accum - azp*azp_adj), all operands are int32_t
+ using ComputeAcc = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::minus, float, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeAcc =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeAcc, Accum, EVTComputeAzp>;
+
+ using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ using EVTComputeScaleB =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeScaleB, ScaleB,
+                                              EVTComputeAcc>;
+
+ using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeScaleBiasA, ScaleA, EVTComputeScaleB, Bias>;
+
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+ torch::Tensor const& azp,
+                                   c10::optional<torch::Tensor> const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, float>(bias);
+    auto azp_args = SUPER::template args_from_tensor<Azp, int32_t>(azp);
+    auto azp_adj_args =
+        SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
+
+ typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
+ typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
+ return ArgumentType{a_args, evt_scale_b_args, bias_args};
+ }
+};
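The rank-1 correction described in the comment above can be spelled out as a plain reference loop. The sketch below is illustrative only (not vLLM code); it makes the O(m+n) storage explicit: only the azp and azp_adj vectors are kept, and the (m, n) correction term is formed element by element.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Reference for the per-token azp epilogue; hypothetical helper, row-major
// accum of shape (m, n). Per-tensor scales can be modeled by vectors of size 1.
std::vector<float> azp_token_epilogue_ref(
    const std::vector<int32_t>& accum, const std::vector<int32_t>& azp,
    const std::vector<int32_t>& azp_adj, const std::vector<float>& a_scale,
    const std::vector<float>& b_scale, const std::vector<float>& bias) {
  const size_t m = azp.size(), n = azp_adj.size();
  std::vector<float> d(m * n);
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < n; ++j) {
      // The rank-1 correction azp[i] * azp_adj[j] is computed on the fly,
      // never materialized as an (m, n) matrix.
      const int32_t corrected = accum[i * n + j] - azp[i] * azp_adj[j];
      d[i * n + j] =
          a_scale[i] * (b_scale[j] * static_cast<float>(corrected)) + bias[j];
    }
  }
  return d;
}
```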
+
+}; // namespace vllm::c2x
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
new file mode 100644
index 0000000000000..fcc17c7727f94
--- /dev/null
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
@@ -0,0 +1,315 @@
+#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
+
+/*
+ This file defines custom epilogues for fusing channel scales, token scales,
+ bias, and activation zero-points onto a GEMM operation using the
+ CUTLASS 3.x API, for NVIDIA GPUs with sm90a (Hopper) or later.
+
+ Epilogues must contain a public type named EVTCompute of type Sm90EVT,
+ as well as a static prepare_args function that constructs an
+ EVTCompute::Arguments struct.
+*/
+
+namespace vllm::c3x {
+
+using namespace cute;
+
+/*
+ * This class provides the common load descriptors for the
+ * ScaledEpilogue[...] classes
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBase {
+ protected:
+ using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
+
+  template <typename T>
+  using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
+      Stride<Int<1>, Int<0>, Int<0>>>;
+
+  template <typename T>
+  using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
+      Stride<Int<0>, Int<1>, Int<0>>>;
+
+  // Don't want to support nullptr by default
+  template <typename T, bool EnableNullPtr = false>
+  using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
+      Stride<Int<1>, Int<0>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
+
+  // Don't want to support nullptr by default
+  template <typename T, bool EnableNullPtr = false>
+  using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
+      Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
+
+ // This utility function constructs the arguments for the load descriptors
+ // from a tensor. It can handle both row and column, as well as row/column or
+ // scalar cases.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(torch::Tensor const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = static_cast<T*>(tensor.data_ptr());
+    if constexpr (std::is_same_v<Descriptor, ColOrScalarLoad<T>> ||
+                  std::is_same_v<Descriptor, RowOrScalarLoad<T>>) {
+      return Arguments{data_ptr, tensor.numel() != 1};
+    } else {
+      static_assert(!std::is_same_v<Descriptor, ColLoad<T, true>> &&
+                    !std::is_same_v<Descriptor, RowLoad<T, true>>);
+ return Arguments{data_ptr};
+ }
+ }
+
+ // This overload handles the case where there might not be a tensor, in which
+ // case a nullptr is passed and a constant (0) is used.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
+    static_assert(std::is_same_v<Descriptor, ColLoad<T, true>> ||
+                  std::is_same_v<Descriptor, RowLoad<T, true>>);
+ return Arguments{data_ptr};
+ }
+};
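A note on how callers are expected to use args_from_tensor: the per-tensor versus per-row/column decision is carried entirely by the `tensor.numel() != 1` flag above. The helper below is a hypothetical illustration of that rule, not an API of this file.

```cpp
#include <torch/torch.h>

// Hypothetical helper mirroring the `tensor.numel() != 1` flag that
// args_from_tensor passes into ColOrScalarLoad / RowOrScalarLoad::Arguments.
// A one-element scale tensor is broadcast as a scalar; anything larger is
// loaded as a full per-row or per-column vector.
inline bool scale_is_scalar(torch::Tensor const& scale) {
  return scale.numel() == 1;
}

// Example shapes for an (m, k) x (k, n) GEMM:
//   a_scales: (1, 1) -> scalar broadcast, (m, 1) -> per-token column load
//   b_scales: (1, 1) -> scalar broadcast, (1, n) -> per-channel row load
```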
+
+/*
+   This epilogue function defines a quantized GEMM operation similar to
+   torch._scaled_mm.
+
+   A and B may both be either int8 or fp8_e4m3. A can be
+ quantized per-tensor or per-row. B can be quantized per-tensor or per-column.
+ Any combination of per-tensor and per-row or column is supported.
+ A and B must have symmetric quantization (zero point == 0).
+
+ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
+ scales are applied elementwise with numpy-style broadcasting.
+
+ ScaleA and ScaleB define the epilogue functions that apply the scales for
+ the A and B operands respectively. These scales may be either per-tensor or
+ per row or column.
+*/
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogue
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+
+ using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 =
+      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
+
+ using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0>;
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+
+ typename EVTCompute0::Arguments evt0_args{b_args};
+ return ArgumentType{a_args, evt0_args};
+ }
+};
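Since the comment above requires symmetric quantization (zero point == 0), the scales consumed by this epilogue typically come from a simple absmax quantizer. The sketch below shows one plausible way such a per-row int8 scale could be produced; it is illustrative only and not part of vLLM.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Symmetric per-row int8 quantization: A_row ~= a_scale * A_q_row with a zero
// point of 0, matching the requirement stated for ScaledEpilogue.
inline float quantize_row_int8(const std::vector<float>& row,
                               std::vector<int8_t>& q) {
  float absmax = 0.0f;
  for (float v : row) absmax = std::max(absmax, std::fabs(v));
  const float scale = absmax > 0.0f ? absmax / 127.0f : 1.0f;
  q.resize(row.size());
  for (size_t k = 0; k < row.size(); ++k) {
    q[k] = static_cast<int8_t>(std::lround(row[k] / scale));
  }
  return scale;  // this value is what ends up in a_scales
}
```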
+
+/*
+ * This epilogue performs the same operation as ScaledEpilogue, but adds a bias.
+ * This bias can also be used in the per-tensor azp case, where the activation
+ * zero point (azp) is used to compute an azp correction term,
+ * which is folded into the bias.
+ *
+ * The bias tensor must be per-output channel.
+ * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBias
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowLoad<float>;
+
+ using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 =
+      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
+
+ using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0, Bias>;
+
+ using ArgumentType = typename EVTCompute::Arguments;
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, float>(bias);
+
+ typename EVTCompute0::Arguments evt0_args{b_args};
+ return ArgumentType{a_args, evt0_args, bias_args};
+ }
+};
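The "folded into the bias" remark above can be made concrete: when a_scale and azp are per-tensor scalars, the azp correction a_scale * b_scale[n] * azp * colsum(B_q)[n] depends only on the output channel, so it can be pre-subtracted from the bias once and this plain bias epilogue used unchanged. A hedged host-side sketch with hypothetical names (not vLLM code):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Fold a per-tensor azp correction into a per-channel bias, assuming a_scale
// and azp are scalars; colsum_bq[j] = sum_k B_q[k][j] (i.e. J @ B_q).
std::vector<float> fold_azp_into_bias(const std::vector<float>& bias,
                                      const std::vector<float>& b_scale,
                                      const std::vector<int32_t>& colsum_bq,
                                      float a_scale, int32_t azp) {
  std::vector<float> folded(bias.size());
  for (size_t j = 0; j < bias.size(); ++j) {
    folded[j] = bias[j] - a_scale * b_scale[j] * static_cast<float>(azp) *
                              static_cast<float>(colsum_bq[j]);
  }
  return folded;
}
```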
+
+/*
+ * This epilogue directly supports per-tensor azp in int32 form.
+ * As opposed to the per-token epilogue below, this epilogue only has an azp_adj
+ * term, which should already be multiplied with the scalar azp.
+ * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B.
+ *
+ * This epilogue also supports bias, which remains per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBiasAzp
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowLoad<float, true>;
+
+  // This is the full AZP term, azp * J @ B, shape (1,n)
+  using AzpWithAdj = typename SUPER::template RowLoad<int32_t>;
+
+ // Compute float(accum - azp_adj), both operands are int32_t
+ using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::minus, float, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeAzp =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeAzp, Accum, AzpWithAdj>;
+
+ using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeScaleB =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAzp>;
+
+ using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleBiasA, ScaleA, EVTComputeScaleB, Bias>;
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+                                   c10::optional<torch::Tensor> const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, float>(bias);
+    auto azp_adj_args =
+        SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
+
+ typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
+ return ArgumentType{a_args, evt_scale_b_args, bias_args};
+ }
+};
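As a sanity check on the algebra this epilogue relies on, the identity A_f @ B_f ≈ a_s * b_s * (A_q @ B_q − azp * (J @ B_q)) can be verified on a toy example. The snippet below is purely illustrative:

```cpp
#include <cassert>
#include <cstdint>

// Verifies, for a 1x2 @ 2x1 toy case, that subtracting azp_adj = azp * (J @ B_q)
// from the raw int32 accumulator equals removing the zero point up front.
int main() {
  const int32_t a_q[2] = {7, 2};   // stored (zero-point-offset) activations
  const int32_t azp = 5;           // per-tensor activation zero point
  const int32_t b_q[2] = {3, -2};  // quantized weights (one output column)

  const int32_t accum = a_q[0] * b_q[0] + a_q[1] * b_q[1];
  const int32_t azp_adj = azp * (b_q[0] + b_q[1]);  // azp * (J @ B_q)
  const int32_t direct = (a_q[0] - azp) * b_q[0] + (a_q[1] - azp) * b_q[1];

  assert(accum - azp_adj == direct);
  return 0;
}
```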
+
+/*
+ * This epilogue supports per-token azp by computing and applying
+ * the correction term using a rank-1 update. If the term were materialized,
+ * it would require O(m*n) space, and this way it only requires O(m+n) space.
+ * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero
+ * point for each row of A.
+ * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B.
+ *
+ * This epilogue also supports bias, which remains per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBiasAzpToken
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
+  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
+  using Bias = typename SUPER::template RowLoad<float, true>;
+
+  // Per-token azp term, shape (m,1)
+  using Azp = typename SUPER::template ColLoad<int32_t>;
+
+  // This is the AZP adjustment term, J @ B, shape (1,n)
+  using AzpAdj = typename SUPER::template RowLoad<int32_t>;
+
+ // Compute azp * azp_adj
+ using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, int32_t, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeAzp =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeAzp, Azp, AzpAdj>;
+
+ // Compute float(accum - azp*azp_adj), all operands are int32_t
+ using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::minus, float, int32_t,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeAcc =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeAcc, Accum, EVTComputeAzp>;
+
+ using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiplies, float, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTComputeScaleB =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAcc>;
+
+ using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
+ cutlass::multiply_add, ElementD, float,
+ cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleBiasA, ScaleA, EVTComputeScaleB, Bias>;
+ using ArgumentType = typename EVTCompute::Arguments;
+
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+ torch::Tensor const& azp,
+                                   c10::optional<torch::Tensor> const& bias) {
+    auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
+    auto bias_args = SUPER::template args_from_tensor<Bias, float>(bias);
+    auto azp_args = SUPER::template args_from_tensor<Azp, int32_t>(azp);
+    auto azp_adj_args =
+        SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
+
+ typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
+ typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
+ return ArgumentType{a_args, evt_scale_b_args, bias_args};
+ }
+};
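The azp_adj input described above (J @ B) is just the per-column sum of the quantized weight matrix and can be computed once when the weights are loaded. A minimal host-side sketch with hypothetical names:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// azp_adj[j] = sum_k B_q[k][j], i.e. the all-ones row J applied to B_q.
// B_q is (k, n), row-major, int8; the result stays in int32 like the GEMM
// accumulator it is subtracted from.
std::vector<int32_t> compute_azp_adj(const std::vector<int8_t>& b_q, size_t k,
                                     size_t n) {
  std::vector<int32_t> azp_adj(n, 0);
  for (size_t kk = 0; kk < k; ++kk) {
    for (size_t j = 0; j < n; ++j) {
      azp_adj[j] += static_cast<int32_t>(b_q[kk * n + j]);
    }
  }
  return azp_adj;
}
```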
+
+}; // namespace vllm::c3x
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index 4fcfcd311aa91..a5beea1a35e49 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -35,6 +35,35 @@ class MixedInputKernelScheduleType(enum.Enum):
}
}
+VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = {
+ **DataTypeSize, # type: ignore
+ **{
+ VLLMDataType.u4b8: 4,
+ VLLMDataType.u8b128: 8,
+ }
+}
+
+VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
+ VLLMDataType.u4b8: "vllm::kU4B8",
+ VLLMDataType.u8b128: "vllm::kU8B128",
+ DataType.u4: "vllm::kU4",
+ DataType.u8: "vllm::kU8",
+ DataType.s4: "vllm::kS4",
+ DataType.s8: "vllm::kS8",
+ DataType.f16: "vllm::kFloat16",
+ DataType.bf16: "vllm::kBfloat16",
+}
+
+VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
+ DataType.u8: "at::ScalarType::Byte",
+ DataType.s8: "at::ScalarType::Char",
+ DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
+ DataType.s32: "at::ScalarType::Int",
+ DataType.f16: "at::ScalarType::Half",
+ DataType.bf16: "at::ScalarType::BFloat16",
+ DataType.f32: "at::ScalarType::Float",
+}
+
VLLMKernelScheduleTag: Dict[Union[
MixedInputKernelScheduleType, KernelScheduleType], str] = {
**KernelScheduleTag, # type: ignore
diff --git a/csrc/cutlass_extensions/vllm_numeric_conversion.cuh b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh
index 2ad914f8e9868..90f226cf64c0a 100644
--- a/csrc/cutlass_extensions/vllm_numeric_conversion.cuh
+++ b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh
@@ -3,6 +3,7 @@
#include "cutlass/numeric_conversion.h"
#include "cutlass_extensions/vllm_custom_types.cuh"
#include "cutlass_extensions/cute_utils.cuh"
+#include "cutlass_extensions/vllm_type_utils.cuh"
// this file extends:
// https://github.com/NVIDIA/cutlass/blob/cutlass-3.5.0/include/cutlass/numeric_conversion.h
@@ -28,8 +29,19 @@ struct InterleavedNumericArrayConverter {
CUTLASS_DEVICE
static result_type convert(source_type const& source) {
- CUTE_INVALID_CONTROL_PATH(
- "InterleavedNumericArrayConverter not implemented\n");
+ if (cute::elect_one_sync()) {
+      if constexpr (std::is_same_v<IlvBlkLayout, void>) {
+ printf(
+ "Convert %s <= %s (N = %d, IlvBlkLayout = void), not implemented\n",
+            nameof_v<T>, nameof_v<S>, N);
+ } else {
+ printf(
+ "Convert %s <= %s (N = %d, size(IlvBlkLayout{}) = %d), not "
+ "implemented\n",
+            nameof_v<T>, nameof_v<S>, N, size(IlvBlkLayout{}));
+ }
+ __brkpt();
+ }
return {};
}
@@ -56,11 +68,6 @@ struct InterleavedNumericArrayConverter<
result_type operator()(source_type const& s) const { return convert(s); }
};
-// TODO (LucasWilkinson): Implement
-// for Array <= Array
-
-// ....
-
 template <typename RegConvert32bit, typename T, typename S, int N>
 struct ArrayConverterPacked32Bit {
   using result_type = Array<T, N>;
@@ -86,14 +93,16 @@ struct ArrayConverterPacked32Bit {
   using ScalarConverter = NumericConverter<T, S, Round>;
   template <typename PackedSrc>
- CUTLASS_DEVICE static uint32_t to_reg(PackedSrc const& source) {
+ CUTLASS_DEVICE static auto to_regs(PackedSrc const& src) {
if constexpr (sizeof(PackedSrc) == 1) {
-      return static_cast<uint32_t>(reinterpret_cast<uint8_t const&>(source));
+      return Array<uint32_t, 1>{reinterpret_cast<uint8_t const&>(src)};
} else if constexpr (sizeof(PackedSrc) == 2) {
-      return static_cast<uint32_t>(reinterpret_cast<uint16_t const&>(source));
+      return Array<uint32_t, 1>{reinterpret_cast<uint16_t const&>(src)};
+ } else if constexpr (sizeof(PackedSrc) == 4) {
+      return Array<uint32_t, 1>{reinterpret_cast<uint32_t const&>(src)};
} else {
- static_assert(sizeof(PackedSrc) == 4);
-      return reinterpret_cast<uint32_t const&>(source);
+ static_assert(sizeof(PackedSrc) == 8);
+      return reinterpret_cast<Array<uint32_t, 2> const&>(src);
}
}
@@ -110,7 +119,7 @@ struct ArrayConverterPacked32Bit {
     static_assert(std::is_same_v<typename PackedSrcType::Element, S>);
     static_assert(std::is_same_v<typename PackedResultType::Element, T>);
-    return RegConvert32bit::template convert<PackedResultType>(to_reg(source));
+    return RegConvert32bit::template convert<PackedResultType>(to_regs(source));
}
friend class detail::VectorizedConverter;
@@ -140,6 +149,131 @@ struct ArrayConverterPacked32Bit {
}
};
+// Convert 8 4-bit values packed into a 32-bit register to 8 8-bit values
+// packed into two 32-bit registers.
+template <uint8_t LUT0, uint8_t LUT1, uint8_t LUT2, uint8_t LUT3,
+          uint8_t LUT4, uint8_t LUT5, uint8_t LUT6, uint8_t LUT7,
+          uint8_t LUT8, uint8_t LUT9, uint8_t LUT10, uint8_t LUT11,
+          uint8_t LUT12, uint8_t LUT13, uint8_t LUT14, uint8_t LUT15>
+CUTLASS_DEVICE cutlass::AlignedArray<uint32_t, 2> lut_4bit_to_8bit_convert(
+ uint32_t src) {
+  cutlass::AlignedArray<uint32_t, 2> r;
+  // Determines whether the value indexes the top half of the LUT
+  // (i.e. LUT[8:15]) if set, or the bottom half (i.e. LUT[0:7]) if not set.
+  // Then move it into bit position 0x4 of each nibble so that, when or'd with
+  // final_prmt_base, it selects the correct candidate. When elements in
+  // final_prmt_base are >= 0x4, the high candidate is selected
+  // (i.e. LUT[8:15]); when elements are < 0x4, the low candidate is selected
+  // (i.e. LUT[0:7]).
+ uint32_t high_bit = (src & 0x88888888) >> 1;
+
+  // `high_bit` is OR'd with final_prmt_base (0x32103210) to find the correct
+  // value in the LUT (selects the correct high or low candidate)
+ const uint32_t final_prmt_base = 0x32103210;
+
+  // Ignore the high bit when indexing into the LUT: for each 4-bit value we
+  // index into both the high and low candidates, then use
+  // high_bit | final_prmt_base to select the correct candidate.
+ uint32_t lut_idx = (src & 0x77777777);
+
+ auto pack = [](uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
+ return uint32_t(a) | (uint32_t(b) << 8) | (uint32_t(c) << 16) |
+ (uint32_t(d) << 24);
+ };
+
+ static constexpr uint32_t LOW_0 = pack(LUT0, LUT1, LUT2, LUT3);
+ static constexpr uint32_t LOW_1 = pack(LUT4, LUT5, LUT6, LUT7);
+ static constexpr uint32_t HIGH_0 = pack(LUT8, LUT9, LUT10, LUT11);
+ static constexpr uint32_t HIGH_1 = pack(LUT12, LUT13, LUT14, LUT15);
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int ii = 0; ii < 2; ++ii, lut_idx >>= 16, high_bit >>= 16) {
+ uint32_t final_prmt_idx = final_prmt_base | high_bit;
+
+    // This uses a lookup table to convert packed int4s to packed int8s,
+    // using the int4 value as the index into prmt. It first selects both the
+    // high and low candidates, then uses the high bit (i.e. `high_bit`) to
+    // select the correct candidate.
+ asm volatile(
+ "{\n"
+ " .reg .b32 low, high;\n"
+ " prmt.b32 low, %1, %2, %5;\n"
+ " prmt.b32 high, %3, %4, %5;\n"
+ " prmt.b32 %0, low, high, %6;\n"
+ "}\n"
+ : "=r"(r[ii])
+ : "n"(LOW_0), "n"(LOW_1), "n"(HIGH_0), "n"(HIGH_1), "r"(lut_idx),
+ "r"(final_prmt_idx));
+ }
+
+ return r;
+};
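For readers who want to check the prmt-based path above, here is a host-side reference of the same LUT conversion (illustrative only, not vLLM code): each of the 8 nibbles of src indexes a 16-entry table, and the 8 resulting bytes are packed little-nibble-first into two 32-bit words, matching r[0] and r[1].

```cpp
#include <array>
#include <cstdint>

// Host reference for lut_4bit_to_8bit_convert: nibble i of `src` selects
// lut[nibble], and the byte lands in word i / 4 at byte position i % 4.
std::array<uint32_t, 2> lut_4bit_to_8bit_ref(
    uint32_t src, const std::array<uint8_t, 16>& lut) {
  std::array<uint32_t, 2> out{0u, 0u};
  for (int i = 0; i < 8; ++i) {
    const uint8_t nibble = (src >> (4 * i)) & 0xF;
    out[i / 4] |= static_cast<uint32_t>(lut[nibble]) << (8 * (i % 4));
  }
  return out;
}
```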
+
+// for Array<int8_t, N> <= Array<vllm_uint4b8_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<int8_t, vllm_uint4b8_t, N, Round> {
+  using result_type = Array<int8_t, N>;
+  using source_type = Array<vllm_uint4b8_t, N>;
+
+ static FloatRoundStyle const round_style = Round;
+
+ private:
+ struct RegConvert {
+    template <typename PackedResultType>
+ CUTLASS_DEVICE static PackedResultType convert(Array