diff --git a/Dockerfile.rocm.ubi b/Dockerfile.rocm.ubi
index 8766b995bb555..8067d260470bc 100644
--- a/Dockerfile.rocm.ubi
+++ b/Dockerfile.rocm.ubi
@@ -1,169 +1,131 @@
-## Global Args ##################################################################
-ARG BASE_UBI_IMAGE_TAG=9.4
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.5-1733767867
 ARG PYTHON_VERSION=3.12
-# Default ROCm ARCHes to build vLLM for.
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
-ARG MAX_JOBS=12
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
+
+## Base Layer ##################################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
 ARG PYTHON_VERSION
+ENV PYTHON_VERSION=${PYTHON_VERSION}
+RUN microdnf -y update && microdnf install -y \
+    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
+    && microdnf clean all
 
-ENV VIRTUAL_ENV=/opt/vllm
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+WORKDIR /workspace
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    microdnf -y update && \
-    microdnf install -y --setopt=install_weak_deps=0 --nodocs \
-        python${PYTHON_VERSION}-devel \
-        python${PYTHON_VERSION}-pip \
-        python${PYTHON_VERSION}-wheel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
-    pip install -U pip wheel setuptools uv && \
-    microdnf clean all
-
-FROM base AS rocm_base
-ARG ROCM_VERSION=6.2.3
-ARG PYTHON_VERSION
-ARG BASE_UBI_IMAGE_TAG
-
-RUN printf "[amdgpu]\n\
-name=amdgpu\n\
-baseurl=https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/${BASE_UBI_IMAGE_TAG}/main/x86_64/\n\
-enabled=1\n\
-priority=50\n\
-gpgcheck=1\n\
-gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key\n\
-[ROCm-${ROCM_VERSION}]\n\
-name=ROCm${ROCM_VERSION}\n\
-baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main\n\
-enabled=1\n\
-priority=50\n\
-gpgcheck=1\n\
-gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" > /etc/yum.repos.d/amdgpu.repo
+ENV LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
 
+# Some utils for dev purposes - tar required for kubectl cp
+RUN microdnf install -y \
+        which procps findutils tar vim git\
+    && microdnf clean all
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    export version="$(awk -F. '{print $1"."$2}' <<< $ROCM_VERSION)" && \
-    uv pip install --pre \
-        --index-url "https://download.pytorch.org/whl/nightly/rocm${version}" \
-        torch==2.6.0.dev20241107+rocm${version}\
-        torchvision==0.20.0.dev20241107+rocm${version} && \
-    # Install libdrm-amdgpu to avoid errors when retrieving device information (amdgpu.ids: No such file or directory)
-    microdnf install -y libdrm-amdgpu && \
-    microdnf clean all
+## Python Installer ############################################################
+FROM base as python-install
+ARG PYTHON_VERSION
 
-ENV LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/numpy.libs:$LD_LIBRARY_PATH"
-ENV LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/pillow.libs:$LD_LIBRARY_PATH"
-ENV LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/triton/backends/amd/lib:$LD_LIBRARY_PATH"
-ENV LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/torch/lib:$LD_LIBRARY_PATH"
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHON_VERSION=${PYTHON_VERSION}
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-devel && \
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
 
-RUN echo $LD_LIBRARY_PATH | tr : \\n >> /etc/ld.so.conf.d/torch-venv.conf && \
-    ldconfig
 
-FROM rocm_base as rocm_devel
+## CUDA Base ###################################################################
+FROM python-install as cuda-base
 
-ENV CCACHE_DIR=/root/.cache/ccache
+RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
+    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
 
-RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
-    rpm -ql epel-release && \
-    microdnf -y update && \
-    microdnf -y install \
-        ccache \
-        git \
-        # packages required to build vllm
-        amd-smi \
-        hipblas-devel \
-        hipblaslt-devel \
-        hipcc \
-        hipcub-devel \
-        hipfft-devel \
-        hiprand-devel \
-        hipsolver-devel \
-        hipsparse-devel \
-        hsa-rocr-devel \
-        miopen-hip-devel \
-        rccl-devel \
-        rocblas-devel \
-        rocm-device-libs \
-        rocprim-devel \
-        rocrand-devel \
-        rocthrust-devel \
-        # end packages required to build vllm
-        wget \
-        which && \
+RUN microdnf install -y \
+        cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
     microdnf clean all
 
-WORKDIR /workspace
-
-ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
-ENV PATH=$PATH:/opt/rocm/bin
-ENV CPLUS_INCLUDE_PATH=$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/torch/include:/opt/rocm/include
+ENV CUDA_HOME="/usr/local/cuda" \
+    PATH="${CUDA_HOME}/bin:${PATH}" \
+    LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
 
+## Python cuda base #################################################################
+FROM cuda-base AS python-cuda-base
 
-FROM rocm_devel AS build_amdsmi
-
-# Build AMD SMI wheel
-RUN cd /opt/rocm/share/amd_smi && \
-    python3 -m pip wheel . --wheel-dir=/install
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
-##################################################################################################
+# install cuda and common dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    uv pip install \
+        -r requirements-cuda.txt
 
-FROM rocm_devel AS build_flashattention
-ARG FA_GFX_ARCHS="gfx90a;gfx942"
+## Development #################################################################
+FROM python-cuda-base AS dev
 
-# the FA_BRANCH commit belongs to the ROCm/flash-attention fork, `main_perf` branch
-ARG FA_BRANCH="3cea2fb"
-ARG MAX_JOBS
-ENV MAX_JOBS=${MAX_JOBS}
+# install build and runtime dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
+    uv pip install \
+        -r requirements-cuda.txt \
+        -r requirements-dev.txt
 
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/workspace/build \
-    mkdir -p /libs && \
-    cd /libs && \
-    git clone https://github.com/ROCm/flash-attention.git && \
-    cd flash-attention && \
-    git checkout ${FA_BRANCH} && \
-    git submodule update --init && \
-    uv pip install cmake ninja packaging && \
-    env \
-        GPU_ARCHS="${FA_GFX_ARCHS}" \
-        python3 setup.py bdist_wheel --dist-dir=/install
+## Builder #####################################################################
+FROM dev AS build
 
-##################################################################################################
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    uv pip install -r requirements-build.txt
 
-FROM rocm_devel AS build_vllm
-ARG PYTORCH_ROCM_ARCH
-ARG PYTHON_VERSION
-ARG MAX_JOBS
-ENV MAX_JOBS=${MAX_JOBS}
-ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+# install compiler cache to speed up compilation leveraging local or remote caching
+# git is required for the cutlass kernels
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all
 
 COPY . .
 
-ENV VLLM_TARGET_DEVICE="rocm"
-ENV MAX_JOBS=${MAX_JOBS}
-# Make sure punica kernels are built (for LoRA)
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+ARG vllm_fa_cmake_gpu_arches
+ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
+# Make sure the cuda environment is in the PATH
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+ENV CCACHE_DIR=/root/.cache/ccache
 
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -v -U \
-        ninja setuptools-scm>=8 "cmake>=3.26" packaging && \
+    --mount=type=bind,src=.git,target=/workspace/.git \
     env CFLAGS="-march=haswell" \
         CXXFLAGS="$CFLAGS $CXXFLAGS" \
         CMAKE_BUILD_TYPE=Release \
-    python3 setup.py bdist_wheel --dist-dir=dist
+        python3 setup.py bdist_wheel --dist-dir=dist
 
 #################### libsodium Build IMAGE ####################
-FROM rocm_base as libsodium-builder
+FROM base as libsodium-builder
 
-RUN microdnf install -y gcc gzip tar \
+RUN microdnf install -y gcc gzip \
     && microdnf clean all
 
 WORKDIR /usr/src/libsodium
@@ -174,44 +136,43 @@ RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM
     && rm -f libsodium*.tar.gz \
     && mv libsodium*/* ./
 
-RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection" \
-    ./configure \
-    --prefix="/usr/" \
-    --libdir=/usr/lib64 && \
-    make -j $(nproc) && \
-    make check
+RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection"\
+    ./configure --prefix="/usr/" && make -j $MAX_JOBS && make check
 
-##################################################################################################
-
-FROM rocm_base AS vllm-openai
-ARG MAX_JOBS
+## Release #####################################################################
+FROM python-install AS vllm-openai
+ARG PYTHON_VERSION
 
 WORKDIR /workspace
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH=$VIRTUAL_ENV/bin:$PATH
 
-# Required for triton
-RUN microdnf install -y --setopt=install_weak_deps=0 --nodocs gcc rsync && \
-    microdnf clean all
+# force using the python venv's cuda runtime libraries
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
+
+# Triton needs a CC compiler
+RUN microdnf install -y gcc \
+    rsync \
+    && microdnf clean all
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
     cd /usr/src/libsodium \
     && make install
 
-RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install/amdsmi/ \
-    --mount=type=bind,from=build_flashattention,src=/install,target=/install/flashattention \
-    --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \
-    --mount=type=cache,target=/root/.cache/pip \
+RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    export version="$(awk -F. '{print $1"."$2}' <<< $ROCM_VERSION)" && \
     uv pip install \
-        --index-strategy=unsafe-best-match \
-        --extra-index-url "https://download.pytorch.org/whl/nightly/rocm${version}" \
-        /install/amdsmi/*.whl\
-        /install/flashattention/*.whl\
-        /install/vllm/*.whl
+        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp312-cp312-linux_x86_64.whl"
 
 ENV HF_HUB_OFFLINE=1 \
     HOME=/home/vllm \
@@ -222,11 +183,6 @@ ENV HF_HUB_OFFLINE=1 \
     VLLM_USAGE_SOURCE=production-docker-image \
     VLLM_WORKER_MULTIPROC_METHOD=fork \
     VLLM_NO_USAGE_STATS=1 \
-    # Silences the HF Tokenizers warning
-    TOKENIZERS_PARALLELISM=false \
-    RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
-    VLLM_USE_TRITON_FLASH_ATTN=0 \
-    HIP_FORCE_DEV_KERNARG=1 \
     OUTLINES_CACHE_DIR=/tmp/outlines \
     NUMBA_CACHE_DIR=/tmp/numba \
     TRITON_CACHE_DIR=/tmp/triton
@@ -234,8 +190,8 @@ ENV HF_HUB_OFFLINE=1 \
 # setup non-root user for OpenShift
 RUN umask 002 && \
     useradd --uid 2000 --gid 0 vllm && \
-    mkdir -p /licenses /home/vllm && \
-    chmod g+rwx /home/vllm
+    mkdir -p /home/vllm && \
+    chmod g+rwx /home/vllm /usr/src /workspace
 
 COPY LICENSE /licenses/vllm.md
 COPY examples/*.jinja /app/data/template/
@@ -252,8 +208,11 @@ USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \
-    HOME=/root uv pip install /install/vllm/*.whl vllm-tgis-adapter==0.5.3
+    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]"
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter@vllm0p6p5
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index a4b7c4df58a9c..8067d260470bc 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -209,7 +209,10 @@ USER root
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.5.3
+    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]"
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter@vllm0p6p5
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \
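
For reference, a minimal sketch of how the release stage of the rewritten Dockerfile might be built. The stage name (vllm-openai) and the build args (max_jobs, nvcc_threads, with defaults 2 and 8) come from the diff above; the image tag and the overridden arg values are illustrative assumptions, not part of this change:

    # Hypothetical invocation; tag and build-arg overrides are illustrative only.
    docker build -f Dockerfile.ubi \
        --build-arg max_jobs=4 \
        --build-arg nvcc_threads=8 \
        --target vllm-openai \
        -t vllm-openai:ubi .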