Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[do-not-merge] Ibm 20241219 #267

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
c77eb8a
[Bugfix] Set temperature=0.7 in test_guided_choice_chat (#11264)
mgoin Dec 18, 2024
bf8717e
[V1] Prefix caching for vision language models (#11187)
comaniac Dec 18, 2024
866fa45
[Bugfix] Restore support for larger block sizes (#11259)
kzawora-intel Dec 18, 2024
8b79f9e
[Bugfix] Fix guided decoding with tokenizer mode mistral (#11046)
wallashss Dec 18, 2024
f04e407
[MISC][XPU]update ipex link for CI fix (#11278)
yma11 Dec 18, 2024
60508ff
[Kernel]: Cutlass 2:4 Sparsity + FP8/Int8 Quant Support (#10995)
dsikka Dec 18, 2024
996aa70
[Bugfix] Fix broken phi3-v mm_processor_kwargs tests (#11263)
Isotr0py Dec 18, 2024
362cff1
[CI][Misc] Remove Github Action Release Workflow (#11274)
simon-mo Dec 18, 2024
f954fe0
[FIX] update openai version (#11287)
jikunshang Dec 18, 2024
ca5f54a
[Bugfix] fix minicpmv test (#11304)
joerunde Dec 18, 2024
fdea8ec
[V1] VLM - enable processor cache by default (#11305)
alexm-neuralmagic Dec 18, 2024
5a9da2e
[Bugfix][Build/CI] Fix sparse CUTLASS compilation on CUDA [12.0, 12.2…
tlrmchlsmth Dec 19, 2024
17ca964
[Model] IBM Granite 3.1 (#11307)
tjohnson31415 Dec 19, 2024
a30482f
[CI] Expand test_guided_generate to test all backends (#11313)
mgoin Dec 19, 2024
c6b0a7d
[V1] Simplify prefix caching logic by removing `num_evictable_compute…
heheda12345 Dec 19, 2024
6142ef0
[VLM] Merged multimodal processor for Qwen2-Audio (#11303)
DarkLight1337 Dec 19, 2024
8936316
[Kernel] Refactor Cutlass c3x (#10049)
varun-sundar-rabindranath Dec 19, 2024
f26c4ae
[Misc] Optimize ray worker initialization time (#11275)
ruisearch42 Dec 19, 2024
9835673
[misc] benchmark_throughput : Add LoRA (#11267)
varun-sundar-rabindranath Dec 19, 2024
5aef498
[Feature] Add load generation config from model (#11164)
liuyanyi Dec 19, 2024
a0f7d53
[Bugfix] Cleanup Pixtral HF code (#11333)
DarkLight1337 Dec 19, 2024
6c7f881
[Model] Add JambaForSequenceClassification model (#10860)
yecohn Dec 19, 2024
d45405a
Merge branch 'main' of https://github.com/vllm-project/vllm into ibm-…
fialhocoelho Dec 19, 2024
5e5307e
Squash 6357
fialhocoelho Dec 19, 2024
e362236
Squash 10235
fialhocoelho Dec 19, 2024
27fe49e
install adapter for 0.6.5 and PR#196 for adapter
yecohn Dec 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,12 @@ steps:
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py
- tests/model_executor/test_guided_processors
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py

- label: Speculative decoding tests # 30min
source_file_dependencies:
Expand Down
123 changes: 62 additions & 61 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,67 +39,68 @@ jobs:
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)

wheel:
name: Build Wheel
runs-on: ${{ matrix.os }}
needs: release

strategy:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']

steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Setup ccache
uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
with:
create-symlink: true
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}

- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
run: |
bash -x .github/workflows/scripts/env.sh

- name: Set up Python
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}

- name: Install CUDA ${{ matrix.cuda-version }}
run: |
bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}

- name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
run: |
bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}

- name: Build wheel
shell: bash
env:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

- name: Upload Release Asset
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/${{ env.wheel_name }}
asset_name: ${{ env.asset_name }}
asset_content_type: application/*
# NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
# wheel:
# name: Build Wheel
# runs-on: ${{ matrix.os }}
# needs: release

# strategy:
# fail-fast: false
# matrix:
# os: ['ubuntu-20.04']
# python-version: ['3.9', '3.10', '3.11', '3.12']
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
# cuda-version: ['11.8', '12.1']

# steps:
# - name: Checkout
# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

# - name: Setup ccache
# uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
# with:
# create-symlink: true
# key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}

# - name: Set up Linux Env
# if: ${{ runner.os == 'Linux' }}
# run: |
# bash -x .github/workflows/scripts/env.sh

# - name: Set up Python
# uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
# with:
# python-version: ${{ matrix.python-version }}

# - name: Install CUDA ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}

# - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}

# - name: Build wheel
# shell: bash
# env:
# CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
# run: |
# bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
# wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
# asset_name=${wheel_name//"linux"/"manylinux1"}
# echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
# echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

# - name: Upload Release Asset
# uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# with:
# upload_url: ${{ needs.release.outputs.upload_url }}
# asset_path: ./dist/${{ env.wheel_name }}
# asset_name: ${{ env.asset_name }}
# asset_content_type: application/*

# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package
Expand Down
39 changes: 33 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")

# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
Expand All @@ -223,13 +223,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.5.1
GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
GIT_SHALLOW FALSE
)
endif()
FetchContent_MakeAvailable(cutlass)
Expand All @@ -241,7 +241,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
"csrc/cutlass_extensions/common.cpp")

set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
Expand Down Expand Up @@ -270,7 +273,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
" in CUDA target architectures")
endif()

#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
Expand Down Expand Up @@ -323,6 +325,31 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()

#
# 2:4 Sparse Kernels

# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper.")
else()
message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()


#
# Machete kernels
Expand Down Expand Up @@ -404,7 +431,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)

Expand Down
Loading
Loading